In [1]:
import math
import os
import re
from collections import Counter

import emoji
import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2
import seaborn as sns
from langdetect import detect
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import find
from scipy.stats import entropy
from shapely.geometry import Point
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2, SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from statsmodels.distributions.empirical_distribution import ECDF

# Base directory (relative to this notebook) used as the prefix of every
# CSV / pickle path read or written by the cells below.
unTPath = "../../../../../unT/ffunes/"

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's default plotting theme.
sns.set_theme()

In [45]:
# Do not truncate long cell values (e.g. tweet text) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

In [2]:
# Open the PostgreSQL connection used by the query cell below.
# Settings are read from the standard libpq environment variables so that no
# credentials are stored in the notebook; the non-secret settings fall back to
# the values this notebook has always used.
try:
    conn = psycopg2.connect(user = os.environ.get("PGUSER", "ffunes"),
                            # No default on purpose: export PGPASSWORD (or use ~/.pgpass).
                            # psycopg2 drops arguments that are None.
                            password = os.environ.get("PGPASSWORD"),
                            host = os.environ.get("PGHOST", "127.0.0.1"),
                            port = os.environ.get("PGPORT", "5432"),
                            database = os.environ.get("PGDATABASE", "elecciones_twitter"))
except psycopg2.Error as error:
    # Only database errors are handled here; anything else should surface.
    # `conn` stays defined so later cells do not fail with a NameError.
    conn = None
    print("No se pudo conectar a la base de datos")
    print(error)

In [None]:
# Fetch id, author id and text of every tweet whose `coordinates` is not NULL.
tweets_content = pd.read_sql_query(
    sql="SELECT id, user_id, full_text FROM tweets WHERE coordinates IS NOT NULL",
    con=conn,
)

In [None]:
# Preview the first rows of the query result.
tweets_content.head()

Cargamos los tweets geolocalizados y los guardamos en un csv

In [30]:
# Cache the query result on disk. With pandas' defaults the DataFrame index is
# written too, as an extra unnamed first column.
tweets_content.to_csv(unTPath + 'exact/content_exact.csv')

In [2]:
# Reload the cached tweets from CSV, keeping only the columns needed below.
# `user_id` uses the nullable integer dtype; `full_text` is read as str.
dtype = {
    "user_id": pd.Int64Dtype(),
    "full_text": str,
}
colnames = ["user_id", "full_text"]

tweets_content = pd.read_csv(
    unTPath + 'exact/content_exact.csv',
    usecols=colnames,
    dtype=dtype,
    index_col=False,
    engine='python',
)

In [4]:
# Display the loaded frame (pandas abbreviates long frames in the rendered output).
tweets_content

Unnamed: 0,user_id,full_text
0,4709290155,#Puff #Shirley #capitone #design #diseño\n#Shi...
1,4709290155,#Puff #Shirley #capitone #design #diseño\n#Shi...
2,4709290155,#Sofa #sillones #Fundas #tusor #gervasoni #gra...
3,4709290155,#Sofa #sillones #Fundas #tusor #gervasoni #gra...
4,4709290155,#Sofa #sillones #Fundas #tusor #gervasoni #gra...
...,...,...
1022065,154678849,"#ludo con el más pequeño... en Paraná, Entre R..."
1022066,586838781,Arranco el #Todoscontraelhambre https://t.co/I...
1022067,598017419,Festejando el cumple de mi abuela!! #Buenos #A...
1022068,335696274,No empezó el superclásico copero y en casa ya ...


Agrupamos los tweets por usuario

In [4]:
# Collapse each user's tweets into a single string, with ' ||| ' between
# consecutive tweets. Missing texts (NaN) are skipped: an empty tweet text is
# read back from the CSV as NaN, and str.join raises TypeError on non-string
# items.
tweets_content_gr = tweets_content.groupby(
    ["user_id"]
)['full_text'].apply(lambda x: ' ||| '.join(x.dropna())).reset_index(name='all_tweets')

In [5]:
# Load the `users` frame from its pickle.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
users = pd.read_pickle(unTPath + 'exact/users_exact_coords.pkl')

In [6]:
# Attach each user's concatenated tweets to the users frame. The left join
# keeps every row of `users`; validate="1:1" makes pandas raise if the join key
# is duplicated on either side. The right-hand key column is dropped afterwards
# because it is redundant with `id`.
users_with_content = users.merge(
    tweets_content_gr,
    how='left',
    left_on=["id"],
    right_on=["user_id"],
    validate="1:1",
).drop(columns=["user_id"])

Extraemos todos los hashtags del contenido

In [7]:
def retrieve_hashtags(row):
    """Return the hashtags found in a row's ``all_tweets`` text.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``"all_tweets"``.

    Returns
    -------
    list of str or None
        Hashtag names without the leading ``#``, in order of appearance and
        with repetitions kept. ``None`` when ``all_tweets`` is not a string,
        which covers both ``None`` and the NaN that a left merge leaves for
        users without tweets.
    """
    text = row["all_tweets"]

    # Missing values reach this function as NaN (a float), not as None, so
    # test for "is a string" rather than "is None".
    if not isinstance(text, str):
        return None

    # The capture group yields the tag without its leading '#'.
    return re.findall(r'#(\w+)', text)

# Row-wise pass: store the result of retrieve_hashtags for every user in a new `hashtags` column.
users_with_content["hashtags"] = users_with_content.apply(retrieve_hashtags, axis=1)

Aplicamos un filtro a los tweets eliminando URLs, menciones, hashtags, símbolos especiales y emojis.

In [8]:
# Patterns removed from the lower-cased text, applied in this order.
replacements = [
    # URLs, with or without a scheme / leading "www."
    r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
    # @mentions
    r'@\w+',
    # #hashtags
    r'#\w+',
    # Remaining punctuation and symbols. '|' is not in the class, so the
    # ' ||| ' separator between tweets is preserved.
    r'[!"$%&()*+,-./:;<=>?[\]^_`{}~#@]',
    # Apostrophes between two lower-case letters
    r'(?<=[a-z])\'(?=[a-z])'
]

def parse_tweets(tweets):
    """Normalize a user's concatenated tweet text.

    The text is lower-cased; URLs, mentions, hashtags, the punctuation listed
    in ``replacements``, emojis and apostrophes are removed; line breaks become
    spaces and runs of whitespace are collapsed to a single space.

    Parameters
    ----------
    tweets : str or any
        Text to clean. Any non-string value (``None``, NaN, ...) is returned
        unchanged.

    Returns
    -------
    str or the original non-string value.

    Errors raised while cleaning a string are no longer swallowed: previously a
    failure part-way through silently returned half-cleaned text.
    """
    # Missing values have nothing to clean; pass them through untouched.
    if not isinstance(tweets, str):
        return tweets

    tweets = tweets.lower()

    for replacement in replacements:
        tweets = re.sub(replacement, '', tweets)

    # emoji >= 1.7 provides replace_emoji(); get_emoji_regexp() was removed in
    # emoji 2.0, so it is only used as a fallback on older installations.
    if hasattr(emoji, 'replace_emoji'):
        tweets = emoji.replace_emoji(tweets, replace='')
    else:
        tweets = re.sub(emoji.get_emoji_regexp(), '', tweets)

    tweets = re.sub(r'\r\n?|\n', ' ', tweets)
    tweets = tweets.replace('\'', '')
    # Collapse any run of whitespace and trim both ends.
    tweets = ' '.join(tweets.split())

    return tweets
    
def simple_parse(row):
    """Row-wise adapter: clean the row's ``all_tweets`` value with ``parse_tweets``."""
    return parse_tweets(row["all_tweets"])

# Row-wise pass: overwrite `all_tweets` with its cleaned version (the raw text is not kept).
users_with_content["all_tweets"] = users_with_content.apply(simple_parse, axis=1)

In [10]:
# Sanity check: (rows, columns) of the enriched users frame.
users_with_content.shape

(37146, 20)

Guardamos el resultado como users_exact_content

In [11]:
# Persist the users frame, now including `hashtags` and the cleaned `all_tweets`, as a pickle.
users_with_content.to_pickle(unTPath + 'exact/users_exact_content.pkl')