# Análisis de los usuarios I

In [40]:
import collections
import csv
import json
import os
import random
import re
from datetime import datetime
from functools import partial
from multiprocessing import Process
from urllib.parse import urlparse

import networkx as nx
import numpy as np
import pandas as pd
import tweepy
from py2neo import Graph, Relationship, NodeMatcher, Node

# Credentials are taken from the environment so real secrets never have to be
# written into the notebook. When a variable is unset the previous placeholder
# values are used, so the dictionaries keep the same keys and defaults.
auth_tweepy = {"access_secret" : os.environ.get("TWITTER_ACCESS_SECRET", "-"),
               "consumer_secret" : os.environ.get("TWITTER_CONSUMER_SECRET", "-"),
               "consumer_key" : os.environ.get("TWITTER_CONSUMER_KEY", "-"),
               "access_token" : os.environ.get("TWITTER_ACCESS_TOKEN", "-")}

auth_neo4j = {"host" : os.environ.get("NEO4J_HOST", "http://localhost:7474"),
              "password" : os.environ.get("NEO4J_PASSWORD", "-")}

## Recogida de los tweets de los usuarios

Obtenemos los usuarios a partir del fichero tomado en diciembre de 2020, filtrando los autores del dataset 'labeled' de *HaterNet*. En total son **3600 usuarios** los que estaban disponibles en esa fecha, por lo que es posible que hoy haya algunos usuarios suspendidos más.

In [38]:
# Load the labelled tweet file and list the author id of every row.
df = pd.read_csv('Dataset/full_data/tweet_hr_label_6K.csv')
authors = df.author_id.tolist()
len(set(authors)) # number of distinct users (repeated authors are dropped)

3600

## Recogida de los tweets de los usuarios 2.0
La red social previa no era muy fuerte por lo que se procede a recoger más tweets procedentes de los recogidos individualmente como HS, y buscaremos en sus contactos.

In [34]:
# Load the reduced user file. This rebinds `df` and `authors` from the earlier cell;
# Neo4jSaveUser.run() iterates over this module-level `authors` list.
df = pd.read_csv('datos_final_743/usuarios_caracteristicas_finales.csv')
authors = df.user_id.tolist()

len(set(authors))

743

Recogemos los 200 tweets más recientes de cada uno de estos usuarios.

In [35]:
## Neo4j Schema
# repl(text) replaces every run of spaces, newlines and tabs in `text` with a single space.
repl = partial(re.sub, '( |\n|\t)+', ' ')

def neo4j_tweet(tweet):
    """Build a py2neo "Tweet" node that stores `tweet` under the `content` property.

    In this notebook it is called from Neo4jSaveUser.user_tweets with the list
    of serialized tweet strings of one user.
    """
    tweet_node = Node("Tweet", content=tweet)
    return tweet_node

def neo4j_multimedia(multimedia):
    """Build a py2neo "Multimedia" node from a (url, netloc, path) sequence."""
    url = multimedia[0]
    netloc = multimedia[1]
    path = multimedia[2]
    return Node("Multimedia", url=url, netloc=netloc, path=path)

def neo4j_vuser(user):
    """Build a placeholder "User" node that only carries an id and virtual='T'."""
    placeholder = Node("User", id=user, virtual='T')
    return placeholder

def neo4j_user(user, n):
    """Build a fully populated "User" node (virtual='F') from a tweepy user object.

    `n` is stored as the `number` property; the description is whitespace-normalised
    with `repl` and `created_at` is stored as a POSIX timestamp.
    """
    properties = {
        "id": user.id,
        "uname": user.name,
        "virtual": 'F',
        "screen_name": user.screen_name,
        "description": repl(user.description),
        "location": user.location,
        "profile_image_url": user.profile_image_url,
        "default_profile": user.default_profile,
        "default_profile_image": user.default_profile_image,
        "geo_enabled": user.geo_enabled,
        "created_at": user.created_at.timestamp(),
        "verified": user.verified,
        "statuses_count": user.statuses_count,
        # Twitter calls these "friends"/"favourites"; the graph uses followees/favorites.
        "followers_count": user.followers_count,
        "followees_count": user.friends_count,
        "favorites_count": user.favourites_count,
        "listed_count": user.listed_count,
        "number": n,
    }
    return Node("User", **properties)

def neo4j_save_info(bd_user, user, n):
    """Copy the profile fields of a tweepy user onto an existing node/mapping.

    Writes the same properties that neo4j_user creates, so a node filled in here
    is indistinguishable from one built directly: the index goes to `number`
    (it used to be written to `n`) and the description is whitespace-normalised
    with `repl` (it used to be stored raw).

    Args:
        bd_user: mutable mapping (e.g. a py2neo node) updated in place.
        user: tweepy user object providing the profile attributes.
        n: running index stored under `number`.

    Returns:
        The same `bd_user` object, now marked with virtual='F'.
    """
    bd_user['number'] = n
    bd_user['uname'] = user.name
    bd_user['virtual'] = 'F'
    bd_user['screen_name'] = user.screen_name
    bd_user['description'] = repl(user.description)
    bd_user['location'] = user.location
    bd_user['profile_image_url'] = user.profile_image_url
    bd_user['default_profile'] = user.default_profile
    bd_user['default_profile_image'] = user.default_profile_image
    bd_user['geo_enabled'] = user.geo_enabled
    bd_user['created_at'] = user.created_at.timestamp()
    bd_user['verified'] = user.verified
    bd_user['statuses_count'] = user.statuses_count
    bd_user['followers_count'] = user.followers_count
    # Twitter calls these "friends"/"favourites"; the graph uses followees/favorites.
    bd_user['followees_count'] = user.friends_count
    bd_user['favorites_count'] = user.favourites_count
    bd_user['listed_count'] = user.listed_count
    return bd_user

def neo4j_tweet_tostring(tweet):
    """Serialize a tweepy status into one '@@@'-separated string with 22 fields.

    Field order:
        0-4   tweet id, text, creation timestamp, favourite count, retweet count
        5-7   reply screen name, replied status id, replied user id
        8-14  quoted JSON, quoted status id, quoted user id, quoted text,
              quoted creation timestamp, quoted favourite count, quoted retweet count
        15-21 retweeted JSON, retweeted status id, retweeted user id, retweeted text,
              retweeted creation timestamp, retweeted favourite count, retweeted retweet count

    The tweet's own text (field 1) is left empty for retweets. Reply, quote and
    retweet fields are empty strings when the tweet is not of that kind.

    NOTE: the separator is not escaped, so a text containing '@@@' would shift
    the fields when the string is split again.
    """
    payload = tweet._json

    def _present(key):
        # Raw JSON value when the key exists and is not None, '' otherwise.
        value = payload.get(key)
        return '' if value is None else value

    rt = _present("retweeted_status")
    qt = _present("quoted_status")
    rp = _present("in_reply_to_screen_name")

    # Only dereference the parsed sub-objects when the JSON says they exist.
    # The quoted status id used to be read whenever the attribute was present,
    # even if its JSON value was None, which raised AttributeError.
    quoted = getattr(tweet, 'quoted_status', None) if qt else None
    retweeted = tweet.retweeted_status if rt else None

    fields = [
        tweet.id,
        '' if rt else repl(tweet.full_text),
        tweet.created_at.timestamp(),
        tweet.favorite_count,
        tweet.retweet_count,
        rp,
        tweet.in_reply_to_status_id if rp else '',
        tweet.in_reply_to_user_id if rp else '',
        qt,
        '' if quoted is None else quoted.id,
        '' if quoted is None else quoted.user.id,
        '' if quoted is None else repl(quoted.full_text),
        # Same conversion as the other two dates (no str/strptime round trip).
        '' if quoted is None else quoted.created_at.timestamp(),
        '' if quoted is None else quoted.favorite_count,
        '' if quoted is None else quoted.retweet_count,
        rt,
        '' if retweeted is None else retweeted.id,
        '' if retweeted is None else retweeted.user.id,
        '' if retweeted is None else repl(retweeted.full_text),
        '' if retweeted is None else retweeted.created_at.timestamp(),
        '' if retweeted is None else retweeted.favorite_count,
        '' if retweeted is None else retweeted.retweet_count,
    ]
    return '@@@'.join(str(field) for field in fields)



In [40]:
class Neo4jSaveUser():
    """Download Twitter users and their recent timelines and store them in Neo4j.

    For every author the crawler merges a "User" node, stores the serialized
    timeline in one "Tweet" node, links the shared URLs as "Multimedia" nodes and
    adds "retweeted"/"quoted" relationships towards the retweeted/quoted users.
    """

    def __init__(self, auth_neo4j, auth_tweepy, next_node, n):
        """Open both connections and make sure the uniqueness constraints exist.

        Args:
            auth_neo4j: dict with the keys "host" and "password".
            auth_tweepy: dict with the four Twitter OAuth credentials.
            next_node: initial value of `self.next_node` (may be None).
            n: first value of the running user index.
        """
        self.api = Neo4jSaveUser.auth_tweepy(auth_tweepy)
        self.graph = Neo4jSaveUser.auth_neo4j(auth_neo4j)
        self.node_selector = NodeMatcher(self.graph)
        self.n = n
        # The argument used to be discarded in favour of None.
        self.next_node = next_node
        self.previous_node = None

        self._ensure_unique("Tweet", "id")
        self._ensure_unique("Multimedia", "url")
        self._ensure_unique("User", "id")
        if self.graph.schema.get_indexes("User"):
            print('Ya existe el usuario')

    def _ensure_unique(self, label, key):
        """Create the uniqueness constraint on label.key unless it is already listed."""
        # Membership test instead of an exact list comparison, so an extra
        # constraint on the same label does not trigger a duplicate creation.
        if key not in self.graph.schema.get_uniqueness_constraints(label):
            self.graph.schema.create_uniqueness_constraint(label, key)

    @staticmethod
    def auth_tweepy(auth):
        """Return a tweepy API client authenticated with the given OAuth credentials."""
        oauth = tweepy.OAuthHandler(auth['consumer_key'], auth['consumer_secret'])
        oauth.set_access_token(auth['access_token'], auth['access_secret'])
        return tweepy.API(oauth)

    @staticmethod
    def auth_neo4j(auth):
        """Return a py2neo Graph connected to auth['host'] with auth['password']."""
        return Graph(auth['host'], password=auth['password'])

    @staticmethod
    def tweet_urls(tweet):
        """Return (netloc+path, netloc, path) for every expanded URL outside twitter.com."""
        urls = []
        for url in tweet.entities.get('urls', []):
            expanded = url.get('expanded_url')
            if expanded is None:
                continue
            parsed = urlparse(expanded)  # parsed once per URL
            if parsed.netloc != "twitter.com":
                urls.append((parsed.netloc + parsed.path, parsed.netloc, parsed.path))
        return urls

    @staticmethod
    def tweet_quoted(tweet):
        """Return a one-element list with the quoted user's id, or [] if nothing is quoted."""
        if hasattr(tweet, 'quoted_status'):
            return [tweet._json['quoted_status']['user']['id']]
        return []

    @staticmethod
    def tweet_rt_users(tweet):
        """Return a one-element list with the retweeted user's id, or [] if not a retweet."""
        if hasattr(tweet, 'retweeted_status'):
            return [tweet._json['retweeted_status']['user']['id']]
        return []

    def user_tweets(self, user):
        """Fetch up to 200 timeline tweets of `user` and store them with their links.

        Creates one "Tweet" node holding the serialized tweets, one "shared"
        relationship per external URL and "retweeted"/"quoted" relationships to
        the referenced users (placeholder nodes are used for unknown users).

        Raises:
            Exception: if several "User" nodes share the same id.
        """
        perfil = self.api.user_timeline(include_rts=True, count=200, trim_user=False, exclude_replies=False, user_id=user['id'], tweet_mode='extended')
        tweets, urls = [], []
        # Sets: only membership and the union of both groups are needed below.
        quoted, retweeted = set(), set()

        for t in perfil:
            tweets.append(neo4j_tweet_tostring(t))
            urls += Neo4jSaveUser.tweet_urls(t)
            quoted.update(Neo4jSaveUser.tweet_quoted(t))
            retweeted.update(Neo4jSaveUser.tweet_rt_users(t))

        tweet = neo4j_tweet(tweets)
        self.graph.create(tweet)
        self.graph.create(Relationship(user, "tweeted", tweet))

        for u in urls:
            url = neo4j_multimedia(u)
            self.graph.merge(url, "Multimedia", 'url')
            self.graph.create(Relationship(user, "shared", url))

        for node in retweeted | quoted:
            next_node = list(self.node_selector.match("User", id=node))

            if len(next_node) == 0:
                virtual_user = neo4j_vuser(node)
            elif len(next_node) == 1:
                virtual_user = next_node[0]
            else:
                raise Exception("Error: varios nodos con el mismo id")

            if node in retweeted:
                rel = Relationship(user, "retweeted", virtual_user)
                self.graph.merge(rel, "User", "id")
            if node in quoted:
                rel = Relationship(user, "quoted", virtual_user)
                self.graph.merge(rel, "User", "id")

    def push_node(self, node):
        """Fill an existing node with the Twitter profile, push it and store its tweets."""
        user = neo4j_save_info(node, self.api.get_user(node['id']), self.n)
        self.n += 1
        self.graph.push(user)
        self.user_tweets(user)
        return user

    def get_adj(self):
        """Return the end nodes of the "retweeted" relationships of `self.next_node`.

        Every relationship found is also printed.
        """
        ret = []
        for rel in self.graph.match(nodes=(self.next_node,), r_type="retweeted"):
            print(rel)
            ret.append(rel.end_node)
        return ret

    def run(self, author_ids=None):
        """Crawl every author id and store the user, tweets and links in Neo4j.

        Args:
            author_ids: iterable of Twitter user ids. When omitted, the
                notebook-level `authors` list is used, as before.

        Authors whose download raises tweepy.TweepError are reported and skipped.
        """
        if author_ids is None:
            author_ids = authors  # notebook-level fallback

        for author in author_ids:
            try:
                user = neo4j_user(self.api.get_user(author), self.n)
                self.graph.merge(user, "User", "id")
                self.user_tweets(user)
            except tweepy.TweepError as exception:
                print(exception)
                continue
            self.n += 1
            self.next_node = user
            if self.next_node:
                # NOTE(review): neo4j_user always sets virtual='F', so this branch
                # looks unreachable for nodes built above — confirm before removing.
                if self.next_node["virtual"] == "T":
                    try:
                        self.next_node = self.push_node(self.next_node)
                        print("(Metido {0}".format(self.next_node["screen_name"]), end=") ")
                    except tweepy.TweepError as exception:
                        print(exception)
                    except IOError as exception:
                        print(exception)
                        self.next_node = self.previous_node
                        continue
                else:
                    print("(Pasado con {0}".format(self.next_node["screen_name"]), end=") ")
            else:
                print('----------- no se ----------------')

            # The returned list is not used; the call is kept for the
            # relationships it prints.
            self.get_adj()
            self.previous_node = self.next_node
            print("Usuario {0} pasado".format(author))
        

In [None]:
# Build the crawler (no initial node, user index starting at 1) and crawl the
# module-level `authors` list. This contacts both Twitter and Neo4j.
crawler = Neo4jSaveUser(auth_neo4j, auth_tweepy, None, 1)
crawler.run()

## Preprocesamiento

#### Guardado del grafo
Volcado de la base de datos recogida en Neo4j Desktop a **usuarios.graphml** 

In [43]:
# Dump the non-virtual users and the retweet links between them into a
# directed networkx graph and save it as GraphML.
grafo = Graph(auth_neo4j['host'], password=auth_neo4j['password'])
grafonx = nx.DiGraph()

consulta_usuarios = """MATCH (a:User) WHERE a.virtual="F" RETURN a as val"""
consulta_retweets = """MATCH (a:User)-[:retweeted]->(b:User) WHERE a.virtual="F" AND b.virtual="F" RETURN a.id as a, b.id as b"""

for fila in grafo.run(cypher=consulta_usuarios).data():
    propiedades = dict(fila['val'])
    identificador = propiedades['id']
    grafonx.add_node(identificador, **propiedades)
    # every user also gets a self-loop
    grafonx.add_edge(identificador, identificador)

for fila in grafo.run(cypher=consulta_retweets).data():
    grafonx.add_edge(fila['a'], fila['b'])

nx.write_graphml(grafonx, "datos_final/usuarios.graphml")

In [44]:
# Drawing is disabled:
#nx.draw(grafonx, pos=nx.spring_layout(grafonx))
# Read the exported graph back and count its distinct nodes.
g = nx.read_graphml("datos_final/usuarios.graphml")
len(set(list(g.nodes))) ## dropping repeated elements

742

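Después de ejecutar la recogida de usuarios anterior, finalmente nos quedamos con **3516** usuarios disponibles para analizar, **84** no se han encontrado o han sido suspendidos. (Estas cifras corresponden a la primera recogida, de 3600 usuarios; en la recogida 2.0, de 743 usuarios, la celda anterior muestra 742 nodos.)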


#### Guardado de los tweets

In [45]:
# Reconnect to Neo4j and count the non-virtual users stored in the database.
grafo = Graph(auth_neo4j['host'], password=auth_neo4j['password'])
# NOTE: `df` is rebound here to the list of result rows returned by .data().
df = grafo.run(cypher="""MATCH (u:User) WHERE u.virtual="F" RETURN COUNT(u) AS number""").data()
df[0]['number']

742

Guardamos los tweets de la base de datos en **tweets.csv**

In [46]:
# One row per (user, Tweet node): user id, screen name and the stored `content`
# (the serialized tweets written by Neo4jSaveUser.user_tweets).
tabla = grafo.run(cypher="""MATCH (u:User)-[:tweeted]->(t:Tweet) RETURN u.id as id, u.screen_name as screen_name, t.content as content""").data()

In [47]:
# Expand every serialized tweet into one CSV row.
# newline='' is what the csv module requires for files handed to csv.writer
# (avoids blank lines between rows on platforms that translate line endings).
with open('datos_final/tweets.csv', 'w', newline='') as file:
    csvw = csv.writer(file)
    # Column names follow the field order produced by neo4j_tweet_tostring:
    # for quotes and retweets the status id comes BEFORE the user id (the
    # header used to list them the other way round, mislabelling both columns).
    csvw.writerow(['user_id', 'screen_name',
                   'tweet_id', 'tweet_text', 'tweet_creation_at', 'tweet_fav_count', 'tweet_rt_count',
                   'is_reply', 'reply_id_status', 'reply_id_user',
                   'is_quote', 'quote_id_status', 'quote_id_user', 'quote_text', 'quote_creation_at', 'quote_fav_count', 'quote_rt_count',
                   'is_rt', 'rt_id_status', 'rt_id_user', 'rt_text', 'rt_creation_at', 'rt_fav_count', 'rt_rt_count'])

    for row in tabla:
        for tweet in row["content"]:
            # NOTE: a tweet whose text contains '@@@' yields extra fields here.
            new_tweet = tweet.split("@@@")
            # Fields 5, 8 and 15 hold the reply screen name and the quoted /
            # retweeted JSON; only their presence is exported (is_reply, is_quote, is_rt).
            for flag_pos in (5, 8, 15):
                new_tweet[flag_pos] = bool(new_tweet[flag_pos])

            csvw.writerow([row["id"], row["screen_name"]] + new_tweet)

In [48]:
# Read the exported tweets back as a DataFrame.
df_tweets = pd.read_csv('datos_final/tweets.csv')
# creation_at --> read with mixed types, which is not a problem here (error_bad_lines=False noted as an option)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# Distinct user ids that have at least one exported tweet.
df_tweets.user_id.drop_duplicates()

## Filtrado de usuarios de odio
Recojo los usuarios que utilizan léxico de odio (insultos, inmigración, misoginia y xenofobia). Filtro estos usuarios y quedan como sospechosos de ser *hateful users*. Los guardo en un subset.

In [100]:
# Build one alternation regex, "(term1)|(term2)|...", from the four lexicon files.
lexicon_paths = ["lexicon/insults_lexicon.txt",
                 "lexicon/immigrant_lexicon.txt",
                 "lexicon/misogyny_lexicon.txt",
                 "lexicon/xenophobia_lexicon.txt"]

lexicon_terms = []
for lexicon_path in lexicon_paths:
    # `with` closes the file even if reading fails.
    with open(lexicon_path, "r") as lexicon_file:
        # The first line of every file is skipped, as before (presumably a header).
        for line in lexicon_file.readlines()[1:]:
            term = line.rstrip()
            # A blank line would add "()", which matches every text.
            if term:
                lexicon_terms.append("({0})".format(term))

regexp = re.compile("|".join(lexicon_terms))

In [25]:
# Display the compiled lexicon pattern.
regexp

re.compile(r'(abanto)|(abrazafarolas)|(adufe)|(alcornoque)|(alfeñique)|(andurriasmo)|(arrastracueros)|(artabán)|(atarre)|(baboso)|(barrabás)|(barriobajero)|(bebecharcos)|(bellaco)|(belloto)|(berzotas)|(besugo)|(bobalicón)|(bocabuzón)|(bocachancla)|(bocallanta)|(boquimuelle)|(borrico)|(botarate)|(brasas)|(cabestro)|(cabezaalberca)|(cabezabuque)|(cachibache)|(cafre)|(cagalindes)|(cagarruta)|(calambuco)|(calamidad)|(caldúo)|(calientahielos)|(calzamonas)|(cansalmas)|(cantamañanas)|(capullo)|(caracaballo)|(caracartón)|(caraculo)|(caraflema)|(carajaula)|(carajote)|(carapapa)|(carapijo)|(cazurro)|(cebollino)|(cenizo)|(cenutrio)|(ceporro)|(cernícalo)|(charrán)|(chiquilicuatre)|(chirimbaina)|(chupacables)|(chupasangre)|(chupóptero)|(cierrabares)|(cipote)|(comebolsas)|(comechapas)|(comeflores)|(comestacas)|(cretino)|(cuerpoescombro)|(culopollo)|(descerebrado)|(desgarracalzas)|(dondiego)|(donnadie)|(echacantos)|(ejarramantas)|(energúmeno)|(esbaratabailes)|(eEscolimoso)|(escornacabras)|(estulto)|(

In [101]:
# Open the tweets CSV; the handle stays open and is closed by the cell that consumes it.
f = open("datos_final/tweets.csv", "r")
re.match(regexp, "")  # result is discarded
# NOTE: despite its name, csv_writer is a csv.DictReader over the file.
csv_writer = csv.DictReader(f)

In [102]:
# Flag every user with at least one row whose own text, retweeted text or
# quoted text matches the lexicon pattern. Keys are user ids, values are True.
set_users = {}
for fila in csv_writer:
    coincidencias = [regexp.search(fila[columna])
                     for columna in ("tweet_text", "rt_text", "quote_text")]
    if any(coincidencia is not None for coincidencia in coincidencias):
        set_users[fila["user_id"]] = True
f.close()

In [103]:
# Number of users flagged by the lexicon filter.
len(set_users)

671

In [None]:
import pickle

# Writing the pickle is disabled; this cell only reloads a previously saved result.
#with open('datos_final/usuarios_mal_lexico.pickle', 'wb') as handle:
#    pickle.dump(set_users, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE: this rebinds set_users with the stored object, replacing the one computed above.
# pickle.load can execute arbitrary code: only load files produced by this project.
with open('datos_final/usuarios_mal_lexico.pickle', 'rb') as handle:
    set_users = pickle.load(handle)

In [16]:
# Keep only the flagged users that are nodes of the graph.
# NOTE(review): grafo_nx must already be loaded; in this notebook it is first
# assigned in a later cell.
nodos_grafo = set(grafo_nx.nodes)  # built once instead of once per user
set_users2 = {k: v for k, v in set_users.items() if k in nodos_grafo}

In [105]:
# Load the user graph, reverse the direction of its edges, mark the flagged
# users with the node attribute "mal_lexico" and save the result.
grafo_nx = nx.read_graphml("datos_final/usuarios.graphml")
# copy=False: networkx returns the reversed graph without copying the data.
grafo_nx = grafo_nx.reverse(copy=False)
nx.set_node_attributes(grafo_nx, name="mal_lexico", values=set_users)
nx.write_graphml(grafo_nx, "datos_final/usuarios_mal_lexico.graphml")

In [2]:
## change made after 36 users were suspended
# Reload the original (non-reversed) user graph, rebinding grafo_nx.
grafo_nx = nx.readwrite.graphml.read_graphml("datos_final/usuarios.graphml")

In [9]:
# Remove from the graph the users that appear in the labelling file but are
# missing from df_all_users, then overwrite usuarios.graphml with the result.
csv1 = pd.read_csv('datos_final/usuarios_a_etiquetar.csv')
df_all_users =  pd.read_csv('exec_users/df_all_users.csv')
usuarios_baja = set(csv1.user_id.tolist()) - set(df_all_users.user_id.tolist())
# Node ids read from GraphML are strings, hence str(). remove_nodes_from skips
# ids that are not in the graph, so the cell can be re-run after the file has
# already been rewritten (remove_node raised on the second run).
grafo_nx.remove_nodes_from(str(user) for user in usuarios_baja)
nx.write_graphml(grafo_nx, "datos_final/usuarios.graphml")

In [24]:
# Read the lexicon-annotated graph and count its nodes.
g = nx.read_graphml("datos_final/usuarios_mal_lexico.graphml")
len(list(g.nodes))

743

In [65]:
df_d = pd.read_csv('datos_final/usuarios_a_etiquetar.csv') ## labelled with haterbert

### Obtención de las características.

In [None]:
# Map user_id -> 1 (hate) or 0 (no hate) from the labelled users file.
# Rows whose `hate` value is neither '1' nor '0' are left out.
set_usuarios = {}
with open("datos_final/usuarios_etiquetados.csv", "r") as f:
    csv_writer = csv.DictReader(f)  # a reader, despite the name
    for l in csv_writer:
        etiqueta = l['hate']
        if etiqueta in ('1', '0'):
            set_usuarios[l['user_id']] = int(etiqueta)

In [23]:
# Mark every node that is a neighbour of a hater ("hater") or of a normal
# user ("normal"), based on the "hate" node attribute.
# NOTE(review): no visible cell writes the "hate" attribute into grafo_nx —
# confirm it is present in the loaded graph.
nodos = grafo_nx.nodes(data='hate')

haters = {}
normales = {}

for nodo, etiqueta in nodos:
    if etiqueta == 1:  # hater
        for vecino in grafo_nx.neighbors(nodo):
            haters[vecino] = True
    if etiqueta == 0:  # normal
        for vecino in grafo_nx.neighbors(nodo):
            # The key used to be the (node, label) tuple, which is not a node id,
            # so the "normal" attribute was never set to True.
            normales[vecino] = True

# Default every node to False, then overwrite the marked ones with True.
nx.set_node_attributes(grafo_nx, name="hater", values=False)
nx.set_node_attributes(grafo_nx, name="hater", values=haters)
nx.set_node_attributes(grafo_nx, name="normal", values=False)
nx.set_node_attributes(grafo_nx, name="normal", values=normales)

In [24]:
len(list(grafo_nx.nodes)) ## I choose k = 3516

743

In [55]:

# Reload the user graph and compute one centrality dictionary per metric.
grafo_nx = nx.read_graphml("datos_final/usuarios.graphml")

# Disabled (kept for reference): eigenvector_centrality, average_clustering,
# percolation_centrality (needs a 'percolation' node attribute), dispersion, voterank.
betweenness = nx.betweenness_centrality(grafo_nx, normalized=False, endpoints=False)
in_degree = nx.in_degree_centrality(grafo_nx)
out_degree = nx.out_degree_centrality(grafo_nx)
clustering = nx.clustering(grafo_nx)  # to be checked that it works
degree = nx.degree_centrality(grafo_nx)
closeness = nx.closeness_centrality(grafo_nx)

# Show only the nodes with a non-zero betweenness.
res = {nodo: valor for nodo, valor in betweenness.items() if valor > 0.0}
print(res)

{'1247806839305031680': 16.5, '1174721480850104320': 17.0, '1331574487788228608': 6.0, '1387868813023186955': 4.0, '1923040616': 3.5, '3184834521': 4.0, '454351037': 1.0, '1353391775373811712': 1.0, '2755884282': 2.0, '468313376': 1.0}


In [74]:
# Take the first eigenvector value.
# NOTE(review): `eigenvector` is not assigned by any active line above (the
# eigenvector_centrality call is commented out) — confirm where it comes from.
a = list(eigenvector.values())[0]

In [75]:
#nx.betweenness_centrality(grafo_nx) ### this is because all of them are connected
# Give node '626174845' the value taken from the first eigenvector entry.
eigenvector.update({'626174845': a})

In [76]:
# Attach every centrality dictionary to the graph as a node attribute of the
# same name, then save the annotated graph.
# Disabled metrics (kept for reference): percolation, dispersion, voterank.
metricas = {
    "betweenness": betweenness,
    "eigenvector": eigenvector,
    "in_degree": in_degree,
    "out_degree": out_degree,
    "clustering": clustering,
    "degree": degree,
    "closeness": closeness,
}
for nombre_metrica, valores_metrica in metricas.items():
    nx.set_node_attributes(grafo_nx, name=nombre_metrica, values=valores_metrica)

nx.write_graphml(grafo_nx, "datos_final/usuarios_hate_centralidad.graphml")

In [77]:
import math
import nltk
from nltk.tokenize import TweetTokenizer
import string
stopwords = nltk.corpus.stopwords.words('spanish')

# Read every per-user feature back from the centrality-annotated graph.
grafo_nx = nx.read_graphml("datos_final/usuarios_hate_centralidad.graphml")


def _atributo_con_nan(nombre):
    """Return node attribute `nombre` as a dict, adding NaN for labelled users that lack it."""
    valores = nx.get_node_attributes(grafo_nx, nombre)
    faltan = set(set_usuarios.keys()) - set(valores.keys())
    return {**valores, **dict.fromkeys(faltan, math.nan)}


hate = nx.get_node_attributes(grafo_nx, "hate")

vec_hate = nx.get_node_attributes(grafo_nx, "hater")
vec_normal = nx.get_node_attributes(grafo_nx, "normal")

# Text-like profile fields may be missing on some nodes: fill those with NaN.
uname = _atributo_con_nan("uname")
screen_name = nx.get_node_attributes(grafo_nx, "screen_name")
description = _atributo_con_nan("description")
#lang = nx.get_node_attributes(grafo_nx, "lang")
location = _atributo_con_nan("location")

created_at = nx.get_node_attributes(grafo_nx, "created_at")
verified = nx.get_node_attributes(grafo_nx, "verified")
statuses_count = nx.get_node_attributes(grafo_nx, "statuses_count")
followers_count = nx.get_node_attributes(grafo_nx, "followers_count")
followees_count = nx.get_node_attributes(grafo_nx, "followees_count")
favorites_count = nx.get_node_attributes(grafo_nx, "favorites_count")
listed_count = nx.get_node_attributes(grafo_nx, "listed_count")
#time_zone = nx.get_node_attributes(grafo_nx, "time_zone")
geo_enabled = nx.get_node_attributes(grafo_nx, "geo_enabled")

prof_img = _atributo_con_nan("profile_image_url")

default_prof = nx.get_node_attributes(grafo_nx, "default_profile")
default_prof_im = nx.get_node_attributes(grafo_nx, "default_profile_image")

betweenness = nx.get_node_attributes(grafo_nx, "betweenness")
eigenvector = nx.get_node_attributes(grafo_nx, "eigenvector")
in_degree = nx.get_node_attributes(grafo_nx, "in_degree")
out_degree = nx.get_node_attributes(grafo_nx, "out_degree")
clustering = nx.get_node_attributes(grafo_nx, "clustering")
degree = nx.get_node_attributes(grafo_nx, "degree")
closeness = nx.get_node_attributes(grafo_nx, "closeness")
#percolation = nx.get_node_attributes(grafo_nx, "percolation")
#dispersion = nx.get_node_attributes(grafo_nx, "dispersion")
#voterank = nx.get_node_attributes(grafo_nx, "voterank")

# Clean every description: keep ASCII letters, digits and spaces, tokenize and
# drop Spanish stopwords.
tweet = TweetTokenizer()
stopwords_es = set(stopwords)  # set membership instead of scanning the list per word
for k, v in description.items():
    # The class used to be "[^-9A-Za-z ]", which kept only '-' and the digit 9
    # among the non-letters; it reads as a typo for the 0-9 range.
    # NOTE(review): accented letters are removed as well — confirm that is intended.
    new_val = re.sub("[^0-9A-Za-z ]", "", str(v))
    tokens = tweet.tokenize(new_val)
    words = nltk.tokenize.word_tokenize(" ".join(tokens))
    description[k] = ' '.join(str(word) for word in words if word not in stopwords_es)


In [80]:
# Export one row per user that has a "hate" attribute: the user id followed by
# its centrality metrics, in the column order listed below.
# Disabled columns (kept for reference): hate, hate_vecino, normal_vecino,
# nombre_perfil, screen_name, descripcion, localizacion, fecha_creacion,
# verificado, statuses_count, followers_count, followees_count,
# favorites_count, listed_count, geo_enabled, profile_image_url,
# profile_changed, img_prof_changed, percolation, dispersion, voterank,
# time_zone, idioma.
columnas_metricas = [
    ("betweenness", betweenness),
    ("eigenvector", eigenvector),
    ("in_degree", in_degree),
    ("out_degree", out_degree),
    ("clustering", clustering),
    ("degree", degree),
    ("closeness", closeness),
]

usuarios = [
    (user_id,) + tuple(valores[user_id] for nombre, valores in columnas_metricas)
    for user_id in hate.keys()
]

cols = ["user_id"] + [nombre for nombre, valores in columnas_metricas]

df = pd.DataFrame.from_records(usuarios, columns=cols)
df.to_csv("datos_final/usuarios_caracteristicas_centralidad_arregladas.csv", index=False)

### Extracción de las características textuales 1.0 (ahora está implementado en acc_analyzer.py)

In [27]:
# NOTE(review): df1 and df2 must already be loaded; every read below is
# commented out, so on a fresh kernel this cell fails — confirm which pair of
# files is the intended input.
#df1 = pd.read_csv("datos_final/usuarios_caracteristicas.csv")
#df2 = pd.read_csv("exec_users/df_all_users.csv")
#df1 = pd.read_csv("datos_final_743/usuarios_caracteristicas.csv")
#df2 = pd.read_csv("datos_final_743/usuarios_caracteristicas_finales.csv")

# Profile and centrality columns kept from df1 ("hate" and "normal_vecino" are left out).
columnas_perfil = [
    "user_id",
    "hate_vecino",
    "nombre_perfil", "screen_name", "descripcion", "localizacion", "fecha_creacion", "verificado",
    "statuses_count", "followers_count", "followees_count", "favorites_count", "listed_count",
    "profile_changed", "img_prof_changed",
    "betweenness", "eigenvector", "in_degree", "out_degree", "clustering", "degree", "closeness",
]

# Activity columns: one per hour of the day (00:00 .. 23:00) and one per weekday (0 .. 6).
columnas_horarias = ["activity_hourly_{:02d}:00".format(hora) for hora in range(24)]
columnas_semanales = ["activity_weekly_{}".format(dia) for dia in range(7)]

# Text/activity columns kept from df2.
columnas_actividad = (
    ["user_id",
     "user_name", "user_geo_enabled", "profile_image_url", "categories_profile_image_url",
     "status_count", "status_retrieving",
     "status_start_date", "status_end_date", "status_days", "status_note",
     "status_average_tweets_per_day"]
    + columnas_horarias
    + columnas_semanales
    + ["top_languages", "top_sources", "geo_enabled_tweet_count", "top_places", "num_hashtags",
       "top_hashtags", "rt_count", "top_retweeted_users",
       "num_mentions", "top_mentioned_users", "num_urls", "top_referenced_domains",
       "negativos", "positivos", "neutros", "hate", "no_hate", "negativos_score",
       "positivos_score", "neutros_score", "hate_score", "no_hate_score", "is_hater",
       "baddies", "n_baddies", "n_baddies_tweet", "len_status", "times_user_rt",
       "times_user_quotes", "num_rts_to_tweets", "num_favs_to_tweets", "top_categories",
       "misspelling_counter", "leet_counter"]
)

df1 = df1[columnas_perfil]
#df3 = df2
df2 = df2[columnas_actividad]
# Inner join of both tables on the user id.
df = df2.merge(df1, on="user_id")

In [31]:
# Save the current `df` without the index column.
df.to_csv("all datasets/labeled_reduced_caracteristicas_finales.csv", index=False)

In [42]:
#df_final.loc[df_final.label == 1]
# NOTE(review): df_final is only assigned in the cell below, so on a fresh
# kernel this cell must run after it.
df_final.to_csv("all datasets/labeled_reduced_1487.csv", index=False)

In [None]:
# Join every labelled tweet (user_id, text, label) with the features of its author.
df = pd.read_csv('all datasets/labeled_reduced_caracteristicas_finales.csv')
df_texto = (
    pd.read_csv('all datasets/tweet_hr_label_6K.csv')
    .rename(columns={"author_id": "user_id"})
    .loc[:, ['user_id', 'text', 'label']]
)
df_final = df_texto.merge(df, on="user_id")
df_final