In [1]:
# Carga de librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from wordcloud import WordCloud

In [2]:
# Natural Language Toolkit
import nltk

In [3]:
# Carga del set de datos
df = pd.read_csv('train.csv')

- `id` - a unique identifier for each tweet
- `text` - the text of the tweet
- `location` - the location the tweet was sent from (may be blank)
- `keyword` - a particular keyword from the tweet (may be blank)
- `target` - in train.csv only, this denotes whether a tweet is about a real disaster (`1`) or not (`0`)

#### Información general del set de datos

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#### Muestra aleatoria del set de datos

In [5]:
df.sample(5)

Unnamed: 0,id,keyword,location,text,target
1338,1935,burning%20buildings,y/e/l,THIS SOUNDS LIKE A SONG YOU WOULD HEAR IN A MO...,0
1847,2657,crush,,'@jorrynja: 6. @ your bf/gf/crush ??' @Ter_ell ??,1
7556,10802,wrecked,"Click the link below, okay",The Twitter update pretty much wrecked the app,0
2122,3049,death,?s????ss? a?????,Ari's hints and snippets will be the death of me.,0
6880,9864,traumatised,Sweden,@Ruddyyyyyy @JamieGriff97 Jamie is too traumat...,0


#### Descripción de variables estadísticas del set de datos

In [6]:
df.describe(include=[np.object, np.number]).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,7613,,,,5441.93,3137.12,1.0,2734.0,5408.0,8146.0,10873.0
keyword,7552,221.0,fatalities,45.0,,,,,,,
location,5080,3341.0,USA,104.0,,,,,,,
text,7613,7503.0,11-Year-Old Boy Charged With Manslaughter of T...,10.0,,,,,,,
target,7613,,,,0.42966,0.49506,0.0,0.0,0.0,1.0,1.0


#### Conversión de tipos de datos

- `id` toma valores del rango `[1, 10873]`, entonces puede usarse el tipo `uint16`.
- `target` puede ser `0` ó `1`.

In [7]:
df['id'] = df['id'].astype(np.uint16)
df['target'] = df['target'].astype(np.uint8)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   uint16
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   uint8 
dtypes: object(3), uint16(1), uint8(1)
memory usage: 200.9+ KB


#### Búsqueda de duplicados

In [8]:
df.duplicated(subset=['id']).sum()

0

Se deduce que todos los tweets son únicos (no hay ducplicados) y que no hay atributos de `text` nulos. Además todas las filas tienen un `target`.

Ahora hay que analizar las palabras.

### Limpieza de datos

Se va a crear un nuevo DataFrame para almacenar los mensajes procesados.

In [9]:
df_text = df['text'].to_frame().copy()

#======= MUESTRA ALEATORIA =======
# Se elige un ancho para la visualización de columnas
pd.set_option('max_colwidth', 150)

display(df.sample(10, random_state=42))

# Se vuelve al valor por default de 'max_colwidth'
pd.reset_option('max_colwidth')

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-imaginable destruction.,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q,1
132,191,aftershock,,Aftershock back to school kick off was great. I want to thank everyone for making it possible. What a great night.,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3,0
5559,7934,rainstorm,,@Calum5SOS you look like you got caught in a rainstorm this is amazing and disgusting at the same time,0
1765,2538,collision,,my favorite lady came to our volunteer meeting\nhopefully joining her youth collision and i am excite http://t.co/Ij0wQ490cS,1
1817,2611,crashed,,@brianroemmele UX fail of EMV - people want to insert and remove quickly like a gas pump stripe reader. 1 person told me it crashed the POS,1
6810,9756,tragedy,"Los Angeles, CA",Can't find my ariana grande shirt this is a fucking tragedy,0
4398,6254,hijacking,"Athens,Greece",The Murderous Story Of AmericaÛªs First Hijacking http://t.co/EYUGk6byxr,1


In [10]:
import re
import string
df_cleaned = df.copy()

In [11]:
def cleaning_text_1(text):
    # Se convierte el texto a minúsculas.
    text = text.lower()
    # Se quitan los '#'.
    text = re.sub('#', '', text)
    # Se quitan los números.
    text = re.sub('\w*\d\w*', '', text)
    # Se quitan los saltos de línea.
    text = re.sub('\n', ' ', text)
    # Se eliminan las referencias a usuarios '@user'.
    text = re.sub('@\S*', '', text)
    # Se quitan vínculos URL.
    text = re.sub('https{0,1}:\/\/\S*', ' ', text)
    # Se simplifican múltiples espacios a uno solo.
    text = re.sub('(\ ){2,7}', ' ',text)
    # Se quitan los signos de puntuación.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

df_cleaned['text'] = df_cleaned['text'].apply(lambda x: cleaning_text_1(x))

#======= MUESTRA ALEATORIA =======
# Se elige un ancho para la visualización de columnas
pd.set_option('max_colwidth', 150)

display(df_cleaned.sample(10, random_state=42))

# Se vuelve al valor por default de 'max_colwidth'
pd.reset_option('max_colwidth')

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,so you have a new weapon that can cause unimaginable destruction,1
2227,3185,deluge,,the famp things i do for gishwhes just got soaked in a deluge going for pads and tampons thx,0
5448,7769,police,UK,dt rt ûïthe col police can catch a pickpocket in liverpool stree,1
132,191,aftershock,,aftershock back to school kick off was great i want to thank everyone for making it possible what a great night,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma children of addicts develop a defensive self one that decreases vulnerability,0
5559,7934,rainstorm,,you look like you got caught in a rainstorm this is amazing and disgusting at the same time,0
1765,2538,collision,,my favorite lady came to our volunteer meeting hopefully joining her youth collision and i am excite,1
1817,2611,crashed,,ux fail of emv people want to insert and remove quickly like a gas pump stripe reader person told me it crashed the pos,1
6810,9756,tragedy,"Los Angeles, CA",cant find my ariana grande shirt this is a fucking tragedy,0
4398,6254,hijacking,"Athens,Greece",the murderous story of americaûªs first hijacking,1


### Generación de una matriz de términos de todos los mensajes.

In [16]:
text_raw = ' '.join(df_cleaned['text'])

# Se simplifican múltiples espacios a uno solo.
text_raw = re.sub('(\ ){2,7}', ' ',text_raw)

text_raw[0:1000]

'our deeds are the reason of this earthquake may allah forgive us all forest fire near la ronge sask canada all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected people receive wildfires evacuation orders in california just got sent this photo from ruby alaska as smoke from wildfires pours into a school rockyfire update california hwy closed in both directions due to lake county fire cafire wildfires flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas im on top of the hill and i can see a fire in the woods theres an emergency evacuation happening now in the building across the street im afraid that the tornado is coming to our area three people died from the heat wave so far haha south tampa is getting flooded hah wait a second i live in south tampa what am i gonna do what am i gonna do fvck flooding raining flooding florida tampabay tampa or days ive lost count flood i

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
data_dtm.columns

Index(['term', 0], dtype='object')

In [36]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform([text_raw])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm = data_dtm.T
data_dtm.reset_index(inplace=True)
data_dtm.rename(columns={'index':'term', 0:'count'}, inplace=True)
data_dtm

Unnamed: 0,term,count
0,aa,2
1,aaaa,1
2,aaaaaaallll,1
3,aaaaaand,1
4,aaarrrgghhh,1
...,...,...
14305,ûóher,1
14306,ûókody,1
14307,ûónegligence,1
14308,ûótech,1


---
---
## Análisis de veracidad del mensaje según su longitud

In [None]:
# Se agrega un columna para la longitud del mensaje.
df['length'] = df['text'].str.len()
# Se informan las mediciones pedidas.
print('Características de longitud del texto del mensaje')
print('Máxima:', df['length'].max(), \
'\tMínina:', df['length'].min(), \
'\tMedia:', np.round(df['length'].mean(), decimals = 0))

In [None]:
# Se arma nuevo DataFrame agrupando por longitud de mensaje
join = pd.DataFrame()
join['is_true'] = df.groupby('length')['target'].agg('sum')
join['total'] = df.groupby('length')['target'].agg('count')
join['portion_is_true'] = round(join['is_true'] / join['total'], 3)
# Se quitan los casos extremos.
clean = join.loc[join['portion_is_true'] < 1]
display(clean.nlargest(5, 'portion_is_true'))
print('Promedio de las cinco longitudes con mejor índice de veracidad:', \
clean.nlargest(5, 'portion_is_true').index.values.mean())
print('Mediana de las cinco longitudes con mejor índice de veracidad:', \
np.median(clean.nlargest(5, 'portion_is_true').index.values))

In [None]:
# Plot
plt.figure(figsize = (7,25))
sb.set(font_scale = 1.5)
plot = sb.heatmap(join[['portion_is_true']], cmap = 'plasma_r', \
cbar = True, xticklabels = [''])
plt.title('Veracidad de mensajes según longitud',fontsize = 25)
plot.set_xlabel('Nivel de veracidad', fontsize = 20)
plot.set_ylabel('Longitud del mensaje', fontsize = 20)
plt.xticks(fontsize = 17, rotation = 0, ha = 'center', va = 'top')
plt.yticks(fontsize = 17, rotation = 0)
plt.tight_layout() # Para que no recorte los títulos.
plt.show()