In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Whether to widen the displayed DataFrame cells (read by the display-option cell below).
strech = True

# Repetition threshold for keeping a term. The filter further down keeps terms
# whose frequency is strictly greater than this value.
repetitions = 5

In [3]:
# Apply the cell-width choice: restore the pandas default unless `strech` is
# truthy, in which case text columns are shown up to 150 characters wide.
if not strech:
    pd.reset_option('max_colwidth')
else:
    pd.set_option('max_colwidth', 150)

In [25]:
# Load the keyword data set; the path is relative to the notebook's directory.
df = pd.read_csv('../datos/train_keyword_corrected.csv')
# Preview the first rows (the keyword column may contain NaN, handled later).
df.head(10)

Unnamed: 0,keyword
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


### Se arma un corpus con las palabras

In [26]:
# Keep only the rows that actually have a keyword, so no NaN reaches the corpus.
has_keyword = df['keyword'].notna()
df_aux = df[has_keyword]
df_aux.head(10)

Unnamed: 0,keyword
31,ablaze
32,ablaze
33,ablaze
34,ablaze
35,ablaze
36,ablaze
37,ablaze
38,ablaze
39,ablaze
40,ablaze


Ya no hay NaN.

In [23]:
%%time
# Concatenate every non-null keyword into one space-separated corpus string.
text_raw = ' '.join(df_aux['keyword'])
# Collapse any run of two or more spaces into a single space. The raw string
# avoids the invalid "\ " escape sequence, and the open-ended quantifier also
# handles runs longer than seven spaces.
text_raw = re.sub(r' {2,}', ' ', text_raw)
# Preview the beginning of the corpus.
text_raw[0:1000]

CPU times: user 4.76 ms, sys: 215 µs, total: 4.98 ms
Wall time: 4.58 ms


'ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze ablaze accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident accident aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks aftershocks airplane accident airpla

In [27]:
%%time
# CountVectorizer ignores the built-in English stop words.
cv = CountVectorizer(stop_words='english')

# The whole corpus is passed as a single document, so the result has one row
# holding the total count of every term.
data_cv = cv.fit_transform([text_raw])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back only on releases that predate it.
term_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=term_names)
# Transpose to one row per term. The single resulting column is labelled 0,
# so give it a meaningful name.
data_dtm = data_dtm.T.rename(columns={0: 'frequency'})

data_dtm.head(5)

CPU times: user 8.7 ms, sys: 157 µs, total: 8.85 ms
Wall time: 8.36 ms


Unnamed: 0,frequency
ablaze,36
accident,70
aftershocks,34
airplane,35
ambulance,38


In [28]:
# Number of distinct terms (one row per term in the frequency table).
len(data_dtm)

224

In [32]:
# Rank the terms from most to least frequent and keep the 5 at the top.
ranked_terms = data_dtm.sort_values(by='frequency', ascending=False)
top_words = ranked_terms.head(5)
top_words

Unnamed: 0,frequency
emergency,147
body,107
burning,106
buildings,105
storm,104


In [34]:
# For each threshold 0, 10, ..., 40 report how many terms occur more often than it.
print('Más de...')
for n in np.arange(0, 50, 10):
    exceeds_threshold = data_dtm['frequency'] > n
    print('\t', n, 'repeticiones:', int(exceeds_threshold.sum()))

Más de...
	 0 repeticiones: 224
	 10 repeticiones: 222
	 20 repeticiones: 220
	 30 repeticiones: 198
	 40 repeticiones: 23


In [36]:
# `repetitions` is set in the configuration cell at the top of the notebook.
# Keep only the terms whose frequency is strictly greater than that threshold.
frequent_mask = data_dtm['frequency'] > repetitions
data_dtm = data_dtm[frequent_mask]
data_dtm.head()

Unnamed: 0,frequency
ablaze,36
accident,70
aftershocks,34
airplane,35
ambulance,38


In [37]:
# Number of terms remaining after the frequency filter.
len(data_dtm)

224

### Se crea la matriz de acuerdo a las palabras filtradas

In [38]:
# Build a word-level vectorizer with a fixed vocabulary taken from the index of
# the filtered term table.
vectorizer = CountVectorizer(
    vocabulary=data_dtm.index,
    analyzer='word',
)

In [40]:
# Replace every NaN with the placeholder string 'ñññ' so the keyword column
# holds only strings. NOTE(review): the placeholder presumably was chosen
# because it matches no vocabulary term -- confirm.
df = df.fillna(value='ñññ')

In [41]:
# Build a sparse matrix counting, for every row of df, how many times each
# vocabulary term appears in that row's keyword.
X = vectorizer.fit_transform(df['keyword'])

In [42]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back only on releases that predate it. list() keeps the
# printed form a plain Python list.
vocabulary_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
print(list(vocabulary_terms))

['ablaze', 'accident', 'aftershocks', 'airplane', 'ambulance', 'annihilated', 'annihilation', 'apocalypse', 'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked', 'avalanche', 'bag', 'bagging', 'bags', 'bang', 'battle', 'bioterror', 'bioterrorism', 'blaze', 'blazing', 'bleeding', 'blew', 'blight', 'blizzard', 'blood', 'bloody', 'blown', 'body', 'bomb', 'bombed', 'bomber', 'bombing', 'bridge', 'buildings', 'burned', 'burning', 'bush', 'casualties', 'casualty', 'catastrophe', 'catastrophic', 'chemical', 'cliff', 'collapse', 'collapsed', 'collide', 'collided', 'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge', 'deluged', 'demolish', 'demolished', 'demolition', 'derail', 'derailed', 'derailment', 'desolate', 'desolation', 'destroy', 'destroyed', 'destruction', 'detonate', 'detonation', 'devastated', 'devastation', 'disaster', 'displaced', 'drought', 'drown', 'drowned', 'drowning', 'dust', 'e

In [43]:
# Show the term counts of row 2. Only that row is converted to a dense array
# (instead of the whole matrix); ravel() flattens it to the same 1-D array.
X[2].toarray().ravel()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [44]:
# Number of entries explicitly stored in the sparse matrix (compare with the
# total cell count of the dense form shown in the next cell).
X.size

8499

In [45]:
# Total number of cells the dense form would hold (rows x columns), computed
# from the shape so the matrix does not have to be densified just to count.
X.shape[0] * X.shape[1]

1705312

In [46]:
%%time
# Materialise the sparse counts as a dense DataFrame (default integer column labels).
df_term_matrix = pd.DataFrame(X.toarray())

CPU times: user 3.21 ms, sys: 3.86 ms, total: 7.07 ms
Wall time: 6.26 ms


In [47]:
# Preview the first row of the dense term matrix.
df_term_matrix.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,214,215,216,217,218,219,220,221,222,223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# The k_ prefix marks these columns as coming from the keyword column.
# Labels that already carry the prefix are left untouched, so re-running this
# cell does not produce k_k_... column names.
df_term_matrix = df_term_matrix.rename(
    columns=lambda label: label if str(label).startswith('k_') else 'k_' + str(label)
)
df_term_matrix.head(3)

Unnamed: 0,k_0,k_1,k_2,k_3,k_4,k_5,k_6,k_7,k_8,k_9,...,k_214,k_215,k_216,k_217,k_218,k_219,k_220,k_221,k_222,k_223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Write the term matrix to CSV, omitting the row index.
df_term_matrix.to_csv('../datos/term_matrix_keyword.csv', index=False)

In [50]:
# Read the file back to check what was written. Note that this rebinds
# df_aux, which earlier held the NaN-free keyword rows.
df_aux = pd.read_csv('../datos/term_matrix_keyword.csv')
df_aux.head(3)

Unnamed: 0,k_0,k_1,k_2,k_3,k_4,k_5,k_6,k_7,k_8,k_9,...,k_214,k_215,k_216,k_217,k_218,k_219,k_220,k_221,k_222,k_223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
