In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Notebook configuration.
#   strech      -> when True, DataFrame cells are displayed with a wider column width.
#   repetitions -> frequency threshold; only terms occurring MORE than this many
#                  times in the whole corpus are kept for the term matrix.
strech, repetitions = True, 35

In [3]:
# Apply the display width chosen in the configuration cell: restore pandas'
# default column width unless the wide layout was requested.
if not strech:
    pd.reset_option('max_colwidth')
else:
    pd.set_option('max_colwidth', 150)

In [4]:
# Load only the cleaned tweet text; every other column of the CSV is skipped.
df = pd.read_csv(
    '../datos/train_text_corrected.csv',
    usecols=['text'],
)
# Preview the first five rows.
df.head()

Unnamed: 0,text
0,our deeds are the reason of this earthquake may allah forgive us all
1,forest fire near la range ask canada
2,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,people receive wildfire evacuation orders in california
4,just got sent this photo from ruby alaska as smoke from wildfire pours into a school


Se va a armar una matriz de términos.

# Primero se buscan los términos más relevantes

In [5]:
%%time
# Se combinan todos los mensajes.
text_raw = ' '.join(df['text'])
# Se simplifican múltiples espacios a uno solo.
text_raw = re.sub('(\ ){2,7}', ' ',text_raw)
text_raw[0:1000]

CPU times: user 23.3 ms, sys: 0 ns, total: 23.3 ms
Wall time: 23.4 ms


'our deeds are the reason of this earthquake may allah forgive us all forest fire near la range ask canada all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected people receive wildfire evacuation orders in california just got sent this photo from ruby alaska as smoke from wildfire pours into a school rockyfire update california why closed in both directions due to lake county fire afire wildfire flood disaster heavy rain causes flash flooding of streets in manitou colorado springs areas im on top of the hill and i can see a fire in the woods theres an emergency evacuation happening now in the building across the street im afraid that the tornado is coming to our area three people died from the heat wave so far aha south tampa is getting flooded hah wait a second i live in south tampa what am i gonna do what am i gonna do fuck flooding raining flooding florida tambay tampa or days ive lost count flood in ago my

`text_raw` es un string, pero para el siguiente paso se necesita un iterable de documentos. Se usa `[text_raw]`.

In [6]:
%%time
# CountVectorizer ignora las stop words seleccionadas.
cv = CountVectorizer(stop_words='english')

data_cv = cv.fit_transform([text_raw])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm = data_dtm.T
# La columna tiene como nombre 0. Se reemplaza adecuadamente.
data_dtm.rename(columns={0:'frequency'},inplace=True)

data_dtm.head(5)

CPU times: user 89.4 ms, sys: 6.8 ms, total: 96.2 ms
Wall time: 95.7 ms


Unnamed: 0,frequency
aa,2
aaa,1
aaaaa,1
aaaaaaallll,1
aaaaaand,1


In [7]:
# Number of distinct terms (one row per term in data_dtm).
len(data_dtm)

12823

Se ve que, luego de la corrección y demás incorporaciones a la limpieza del texto, la cantidad de palabras distintas encontradas pasó de 14301 a 12823.

### Se quitan los términos que se repiten poco.

In [8]:
# Sort terms by ascending frequency and keep the first 5 rows, i.e. five of
# the LEAST frequent terms (the kind of rare term this section removes).
# NOTE(review): despite its name, `top_words` holds the rarest terms, not the
# most frequent ones; use ascending=False if the most frequent were intended.
top_words = data_dtm.sort_values('frequency').head(5)
top_words

Unnamed: 0,frequency
ñransomwareñ,1
hailstormwindstorm,1
hailing,1
haile,1
haildamage,1


### Cantidad de palabras luego de remoción de términos con poca repetición

In [9]:
# Coarse sweep: number of terms occurring more than n times, n = 0, 50, ..., 300.
print('Más de...')
for n in range(0, 301, 50):
    print('\t', n, 'repeticiones:', len(data_dtm[data_dtm['frequency'] > n]))

Más de...
	 0 repeticiones: 12823
	 50 repeticiones: 119
	 100 repeticiones: 30
	 150 repeticiones: 11
	 200 repeticiones: 7
	 250 repeticiones: 4
	 300 repeticiones: 3


In [10]:
# Medium sweep: number of terms occurring more than n times, n = 0, 10, ..., 100.
print('Más de...')
for n in range(0, 101, 10):
    print('\t', n, 'repeticiones:', len(data_dtm[data_dtm['frequency'] > n]))

Más de...
	 0 repeticiones: 12823
	 10 repeticiones: 1226
	 20 repeticiones: 648
	 30 repeticiones: 412
	 40 repeticiones: 212
	 50 repeticiones: 119
	 60 repeticiones: 92
	 70 repeticiones: 68
	 80 repeticiones: 54
	 90 repeticiones: 42
	 100 repeticiones: 30


### En un intervalo con más granularidad.

In [11]:
# Fine sweep around the candidate cut-off: thresholds n = 30, 31, ..., 40.
print('Más de...')
for n in range(30, 41):
    print('\t', n, 'repeticiones:', len(data_dtm[data_dtm['frequency'] > n]))

Más de...
	 30 repeticiones: 412
	 31 repeticiones: 391
	 32 repeticiones: 371
	 33 repeticiones: 351
	 34 repeticiones: 333
	 35 repeticiones: 306
	 36 repeticiones: 282
	 37 repeticiones: 262
	 38 repeticiones: 245
	 39 repeticiones: 227
	 40 repeticiones: 212


### Se eligen palabras con más de 35 repeticiones.

In [12]:
# `repetitions` is defined in the configuration cell at the top of the notebook.
# Keep only the terms whose corpus-wide frequency is strictly greater than it.
data_dtm = data_dtm[data_dtm['frequency'].gt(repetitions)]
data_dtm.head()

Unnamed: 0,frequency
accident,87
air,41
airplane,37
ambulance,40
amp,303


In [13]:
# Number of terms that survived the frequency filter.
len(data_dtm)

306

### Se crea la matriz de acuerdo a las palabras filtradas

In [14]:
# Word-level vectorizer restricted to the filtered terms: the vocabulary is
# fixed to the index of data_dtm, so only those terms are counted.
vectorizer = CountVectorizer(
    vocabulary=data_dtm.index,
    analyzer='word',
)

In [15]:
# Build a sparse matrix with one row per tweet and one column per vocabulary
# term; each entry counts the occurrences of that term in that tweet's text.
X = vectorizer.fit_transform(raw_documents=df['text'])

In [16]:
# List the terms in column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# The result is converted to a plain list so the printed form is the same
# on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    selected_terms = list(vectorizer.get_feature_names_out())
else:
    selected_terms = list(vectorizer.get_feature_names())
print(selected_terms)



In [17]:
# Term counts for the third tweet (row index 2) as a 1-D dense array.
# The row is sliced from the sparse matrix first, so only that single row is
# densified instead of materialising the whole matrix to read one row.
X[2].toarray().ravel()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [18]:
# Number of values actually stored in the sparse matrix.
X.nnz

18001

In [19]:
# Total number of cells the dense equivalent would hold (rows x columns).
# Computed from the shape so the full dense array is not allocated just to
# count its elements.
X.shape[0] * X.shape[1]

2329578

In [20]:
%%time
df_term_matrix = pd.DataFrame(data=X.toarray())

CPU times: user 3.3 ms, sys: 3.73 ms, total: 7.03 ms
Wall time: 6.72 ms


In [21]:
# Preview the first row of the term matrix.
df_term_matrix.iloc[:1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,300,301,302,303,304,305
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# The t_ prefix marks these columns as derived from the tweets' text column.
# Only labels that do not already carry the prefix are renamed, so re-running
# this cell is harmless (a plain add_prefix would yield 't_t_0', 't_t_1', ...
# on a second execution).
df_term_matrix = df_term_matrix.rename(
    columns=lambda label: label if str(label).startswith('t_') else f't_{label}'
)
df_term_matrix.head(3)

Unnamed: 0,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,...,t_296,t_297,t_298,t_299,t_300,t_301,t_302,t_303,t_304,t_305
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Persist the term matrix as CSV; the row index is not written.
df_term_matrix.to_csv(
    '../datos/term_matrix_text.csv',
    index=False,
)

In [24]:
# Read the saved CSV back and preview three rows to check that the file
# round-trips with the expected t_* columns.
df_aux = pd.read_csv(
    '../datos/term_matrix_text.csv',
)
df_aux.head(n=3)

Unnamed: 0,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,...,t_296,t_297,t_298,t_299,t_300,t_301,t_302,t_303,t_304,t_305
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
