In [32]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import json
from datetime import datetime
import chardet
import re

In [33]:
def safe_get(dictionary, *keys):
    """Follow a chain of keys through nested dicts.

    Starting from `dictionary`, each key in `keys` is looked up in turn.
    Returns the value reached after the last key, or None as soon as the
    current value is not a dict or does not contain the next key. With no
    keys, `dictionary` itself is returned.
    """
    current = dictionary
    for step in keys:
        # Guard clause: bail out on the first level that cannot be descended.
        if not isinstance(current, dict) or step not in current:
            return None
        current = current[step]
    return current

# Read the file and parse JSON data.
# Every line that starts with '{' is parsed as one JSON object (one tweet);
# any other line is reported and skipped.
tweets = []
# The encoding is pinned so the accented characters in the tweets decode the
# same way on every platform instead of depending on the locale default.
with open('traficogt.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line.startswith('{'):
            print(f"Skipping non-JSON line: {line[:50]}...")
            continue

        try:
            tweet = json.loads(line)
        except json.JSONDecodeError as exc:
            print(f"Skipping invalid JSON: {line[:50]}...")
            # Show the actual parse error (position and reason), not the exception class.
            print(exc)
            continue

        # 'mentionedUsers' may be absent or null; fall back to an empty list
        # so such a tweet is kept instead of raising TypeError.
        mentions = safe_get(tweet, 'mentionedUsers') or []
        usernames = [safe_get(mention, 'username') for mention in mentions]
        # Normalise handles: lower-case, then drop everything outside a-z, 0-9 and space.
        usernames = [
            re.sub(r"[^a-z0-9 ]", '', username.lower())
            for username in usernames
            if username is not None
        ]

        tweets.append({
            'date': safe_get(tweet, 'date'),
            'content': safe_get(tweet, 'rawContent'),
            'username': safe_get(tweet, 'user', 'username'),
            'replyCount': safe_get(tweet, 'replyCount'),
            'retweetCount': safe_get(tweet, 'retweetCount'),
            'quoteCount': safe_get(tweet, 'quoteCount'),
            'bookmarkedCount': safe_get(tweet, 'bookmarkedCount'),
            'likeCount': safe_get(tweet, 'likeCount'),
            'viewCount': safe_get(tweet, 'viewCount'),
            'inReplyToUser': safe_get(tweet, 'inReplyToUser', 'username'),
            'mentionedUsers': usernames,
            'hashtags': safe_get(tweet, 'hashtags'),
        })

# Create DataFrame (one row per successfully parsed tweet)
df = pd.DataFrame(tweets)

df

Unnamed: 0,date,content,username,replyCount,retweetCount,quoteCount,bookmarkedCount,likeCount,viewCount,inReplyToUser,mentionedUsers,hashtags
0,2024-09-12 14:22:06+00:00,Es comprensible la resolución... El ruso sabe ...,traficogt,0,0,0,0,1,393.0,,[],[]
1,2024-09-12 00:39:56+00:00,La corrupción de la @CC_Guatemala\nes descarad...,monymmorales,0,56,4,1,84,1587.0,,[ccguatemala],[]
2,2024-09-12 01:21:04+00:00,@PNCdeGuatemala @mingobguate @FJimenezmingob @...,animaldgalaccia,0,0,0,0,1,91.0,PNCdeGuatemala,"[pncdeguatemala, mingobguate, fjimenezmingob, ...",[]
3,2024-09-11 20:20:01+00:00,@amilcarmontejo @AztecaNoticiaGT @BancadaSemil...,EstacionDobleA,0,0,0,0,0,46.0,EstacionDobleA,"[amilcarmontejo, aztecanoticiagt, bancadasemil...",[]
4,2024-09-11 00:34:31+00:00,@soy_502 @AztecaNoticiaGT @CONAPgt @DenunciaEM...,CubReserva,0,0,0,0,1,171.0,CubReserva,"[soy502, aztecanoticiagt, conapgt, denunciaemp...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...
5600,2023-10-08 21:37:45+00:00,@hshetemul @traficogt Y de igual Manera quitan...,mvtrooper,0,0,0,0,0,3.0,hshetemul,"[hshetemul, traficogt, mpguatemala, cangguatem...",[]
5601,2023-10-08 21:37:19+00:00,@traficogt Es algo más fuerte que ellos no qui...,elmeronene1,0,0,0,0,0,21.0,traficogt,[traficogt],[]
5602,2023-10-08 21:31:24+00:00,@Factor4_GT @traficogt @CC_Guatemala @MPguatem...,mvtrooper,0,0,0,0,0,4.0,Factor4_GT,"[factor4gt, traficogt, ccguatemala, mpguatemala]",[]
5603,2023-10-08 21:13:33+00:00,@AgenciaOcote @traficogt Vieja ignorante,91072dff5a5a4f1,0,0,0,0,0,17.0,AgenciaOcote,"[agenciaocote, traficogt]",[]


## Preprocesamiento de datos

### Cambios en content

In [34]:
# Normalise the tweet text for word-level analysis.
df['content'] = df['content'].str.lower()
# Drop URLs; the dot after "www" is escaped so only a literal "www." prefix matches.
df['content'] = df['content'].str.replace(r'http\S+|www\.\S+', '', regex=True)
# Drop @mentions and #hashtags.
df['content'] = df['content'].str.replace(r'@\S+|#\S+', '', regex=True)
# Turn newlines/tabs into single spaces first; otherwise the character filter
# below deletes them and glues the words on either side together.
df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True)
# Keep Spanish letters (á é í ó ú ü ñ) along with a-z, 0-9 and space: stripping
# them mangles words ("resolución" -> "resolucin") and prevents accented
# stopwords from matching during stopword removal.
df['content'] = df['content'].str.replace(r"[^a-z0-9áéíóúüñ ]", '', regex=True)

In [35]:
import ssl

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# NOTE(review): when the private ssl._create_unverified_context hook exists, it
# is installed as the default HTTPS context factory for the whole kernel
# session. This appears to be a workaround for certificate errors during
# nltk.download — confirm it is still needed, since it affects every later
# HTTPS request as well.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the stopword corpus (NLTK reports "already up-to-date" when present).
nltk.download('stopwords')

# Spanish Snowball stemmer instance.
stemmer = SnowballStemmer('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/estebandonis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Spanish stopwords held in a set so membership tests in preprocess_text are fast.
stop_words = {word for word in stopwords.words('spanish')}

def preprocess_text(text, stopword_set=None):
    """Remove stopwords from a whitespace-tokenised string.

    Parameters
    ----------
    text : str
        Text to filter. Any non-string value (e.g. None or NaN for a tweet
        with no content) yields an empty string instead of raising.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the module-level `stop_words`.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        # Resolved at call time so the notebook-level set is used by default.
        stopword_set = stop_words
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [37]:
# Run stopword removal on every tweet's text, element by element.
df['content'] = df['content'].map(preprocess_text)

### Cambios en username

In [38]:
# Normalise author handles with the same rule applied to mentionedUsers at load
# time (lower-case, keep only a-z, 0-9 and space) so the same account has the
# same spelling in both columns. The second step previously targeted 'content'
# by mistake, leaving underscores and other symbols in 'username'.
df['username'] = df['username'].str.lower()
df['username'] = df['username'].str.replace(r"[^a-z0-9 ]", '', regex=True)

### Cambios en inReplyToUser

In [39]:
# Normalise replied-to handles with the same rule applied to mentionedUsers at
# load time (lower-case, keep only a-z, 0-9 and space). The second step
# previously targeted 'content' by mistake, so 'inReplyToUser' kept its symbols.
# Missing values (tweets that are not replies) stay missing.
df['inReplyToUser'] = df['inReplyToUser'].str.lower()
df['inReplyToUser'] = df['inReplyToUser'].str.replace(r"[^a-z0-9 ]", '', regex=True)

## Análisis Exploratorio

In [40]:
# Print a structural summary of the frame: columns, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5605 entries, 0 to 5604
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             5605 non-null   object 
 1   content          5605 non-null   object 
 2   username         5605 non-null   object 
 3   replyCount       5605 non-null   int64  
 4   retweetCount     5605 non-null   int64  
 5   quoteCount       5605 non-null   int64  
 6   bookmarkedCount  5605 non-null   int64  
 7   likeCount        5605 non-null   int64  
 8   viewCount        5601 non-null   float64
 9   inReplyToUser    4151 non-null   object 
 10  mentionedUsers   5605 non-null   object 
 11  hashtags         5605 non-null   object 
dtypes: float64(1), int64(5), object(6)
memory usage: 525.6+ KB
