#### Procesamiento anterior de los datos

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import cycle, islice

%matplotlib inline

In [2]:
entrenamiento_df = pd.read_csv('Archivos/entrenamiento_df.csv')

In [3]:
entrenamiento_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df = pd.read_csv('Archivos/test_df.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# 'longitud_text' contiene la longitud de la columna 'text'. 
test_df['text']=test_df['text'].astype(str)
test_df['longitud_text']=test_df.text.str.len()
test_df.head(10)

Unnamed: 0,id,keyword,location,text,longitud_text
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [6]:
#agrego columna: cantidad de palabras de text

test_df['cantidad_palabras']=test_df['text'].apply(lambda x: len(x.split()) )
test_df.head(10)

Unnamed: 0,id,keyword,location,text,longitud_text,cantidad_palabras
0,0,,,Just happened a terrible car crash,34,6
1,2,,,"Heard about #earthquake is different cities, s...",64,9
2,3,,,"there is a forest fire at spot pond, geese are...",96,19
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,4
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,8
5,12,,,We're shaking...It's an earthquake,34,4
6,21,,,They'd probably still show more life than Arse...,72,12
7,22,,,Hey! How are you?,17,4
8,27,,,What a nice hat?,16,4
9,29,,,Fuck off!,9,2


In [7]:
# 'longitud_text' contiene la longitud de la columna 'text'. 
entrenamiento_df['text']=entrenamiento_df['text'].astype(str)
entrenamiento_df['longitud_text']=entrenamiento_df.text.str.len()
entrenamiento_df.head(10)

Unnamed: 0,id,keyword,location,text,target,longitud_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,110
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,95
7,13,,,I'm on top of the hill and I can see a fire in...,1,59
8,14,,,There's an emergency evacuation happening now ...,1,79
9,15,,,I'm afraid that the tornado is coming to our a...,1,52


In [8]:
#agrego columna: cantidad de palabras de text

entrenamiento_df['cantidad_palabras']=entrenamiento_df['text'].apply(lambda x: len(x.split()) )
entrenamiento_df.head(10)


Unnamed: 0,id,keyword,location,text,target,longitud_text,cantidad_palabras
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,110,18
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,95,14
7,13,,,I'm on top of the hill and I can see a fire in...,1,59,15
8,14,,,There's an emergency evacuation happening now ...,1,79,12
9,15,,,I'm afraid that the tornado is coming to our a...,1,52,10


### Agrego features

In [9]:
entrenamiento_df = entrenamiento_df.fillna('0')
train = entrenamiento_df.drop(['id','keyword','location','text'], axis=1)

In [10]:
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras
0,1,69,13
1,1,38,7
2,1,133,22
3,1,65,8
4,1,88,16


In [11]:
# Agrego la columna keyword_mean con mean encoding
mean_keyword = entrenamiento_df.groupby('keyword')['target'].mean()
mean_keyword[:5]

keyword
0                    0.688525
ablaze               0.361111
accident             0.685714
aftershock           0.000000
airplane accident    0.857143
Name: target, dtype: float64

In [12]:
train['keyword_mean'] = entrenamiento_df['keyword'].replace(mean_keyword)
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean
0,1,69,13,0.688525
1,1,38,7,0.688525
2,1,133,22,0.688525
3,1,65,8,0.688525
4,1,88,16,0.688525


In [13]:
# Agrego la columna location_mean con mean encoding
mean_location = entrenamiento_df.groupby('location')['target'].mean()
train['location_mean'] = entrenamiento_df['location'].replace(mean_location)
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean,location_mean
0,1,69,13,0.688525,0.424398
1,1,38,7,0.688525,0.424398
2,1,133,22,0.688525,0.424398
3,1,65,8,0.688525,0.424398
4,1,88,16,0.688525,0.424398


In [14]:
# Agrego columna numerales con la suma de los numerales del texto
train['numerales'] = entrenamiento_df['text'].agg(lambda x: x.count('#'))

In [15]:
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales
0,1,69,13,0.688525,0.424398,1
1,1,38,7,0.688525,0.424398,0
2,1,133,22,0.688525,0.424398,0
3,1,65,8,0.688525,0.424398,1
4,1,88,16,0.688525,0.424398,2


In [16]:
train.to_csv('Archivos/train_5_features.csv', index=False)

In [17]:
test_df = test_df.fillna('0')
test = test_df.drop(['keyword','location','text'], axis=1)
test.shape

(3263, 3)

In [18]:
# Agrego la columna keyword_mean con el mean encoding del set train
test['keyword_mean'] = test_df['keyword'].replace(mean_keyword)

In [19]:
# Agrego la columna location_mean con el mean encoding del set train
test['location_mean'] = test_df['location'].replace(mean_location)

In [20]:
# Agrego columna numerales con la suma de los numerales del texto
test['numerales'] = test_df['text'].agg(lambda x: x.count('#'))

In [21]:
test.head()

Unnamed: 0,id,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales
0,0,34,6,0.688525,0.424398,0
1,2,64,9,0.688525,0.424398,1
2,3,96,19,0.688525,0.424398,0
3,9,40,4,0.688525,0.424398,2
4,11,45,8,0.688525,0.424398,0


In [22]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 6 columns):
id                   3263 non-null int64
longitud_text        3263 non-null int64
cantidad_palabras    3263 non-null int64
keyword_mean         3263 non-null float64
location_mean        3263 non-null object
numerales            3263 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 153.0+ KB


In [23]:
# A las ubicaciones que no estuvieron en el train les asigno el valor 0.5
ubicaciones = test['location_mean'].agg(lambda x: pd.api.types.is_float(x))
test['location_mean'] = test['location_mean'].where(ubicaciones,0.5)

In [24]:
test.to_csv('Archivos/test_5_features.csv', index=False)

### Creo nuevos archivos separando el texto, eliminando stop words y aplicando streamming

In [3]:
import nltk

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
from nltk.stem import PorterStemmer

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/cecilia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cecilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
train = pd.read_csv('Archivos/entrenamiento_df.csv')
test = pd.read_csv('Archivos/test_df.csv')

In [33]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [34]:
train['text'][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [35]:
# Separa en palabras, ignorando los signos de puntuación
palabras = RegexpTokenizer(r'\w+').tokenize(train['text'][2])
palabras

['All',
 'residents',
 'asked',
 'to',
 'shelter',
 'in',
 'place',
 'are',
 'being',
 'notified',
 'by',
 'officers',
 'No',
 'other',
 'evacuation',
 'or',
 'shelter',
 'in',
 'place',
 'orders',
 'are',
 'expected']

In [36]:
# Se quitan las stop words
palabras = [w for w in palabras if not w in stopwords.words('english')]
palabras

['All',
 'residents',
 'asked',
 'shelter',
 'place',
 'notified',
 'officers',
 'No',
 'evacuation',
 'shelter',
 'place',
 'orders',
 'expected']

In [37]:
# Se vuelven las palabras a su raíz
[PorterStemmer().stem(word) for word in palabras]

[u'All',
 u'resid',
 u'ask',
 u'shelter',
 u'place',
 u'notifi',
 u'offic',
 u'No',
 u'evacu',
 u'shelter',
 u'place',
 u'order',
 u'expect']

In [38]:
# Separo el texto en palabras
train['text'] = train['text'].apply(lambda x: RegexpTokenizer(r'\w+').tokenize(x))

In [39]:
# Elimino las stop words y paso las palabras a minusculas
train['text'] = train['text'].apply(lambda x: [w.lower() for w in x if w not in stopwords.words('english')])

  """Entry point for launching an IPython kernel.


In [40]:
# Convierto las palabras a su raíz
train['text'] = train['text'].apply(lambda x: [PorterStemmer().stem(w.decode("utf8", "ignore")) for w in x])

In [41]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deed, reason, earthquak, may, allah, for...",1
1,4,,,"[forest, fire, near, la, rong, sask, canada]",1
2,5,,,"[all, resid, ask, shelter, place, notifi, offi...",1
3,6,,,"[13, 000, peopl, receiv, wildfir, evacu, order...",1
4,7,,,"[just, got, sent, photo, rubi, alaska, smoke, ...",1


In [44]:
train.to_csv('Archivos/entrenamiento_nltk.csv', index=False)

In [42]:
# Hago lo mismo con el archivo test
test['text'] = test['text'].apply(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
test['text'] = test['text'].apply(lambda x: [w.lower() for w in x if w not in stopwords.words('english')])
test['text'] = test['text'].apply(lambda x: [PorterStemmer().stem(w.decode("utf8", "ignore")) for w in x])

  This is separate from the ipykernel package so we can avoid doing imports until


In [43]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,"[just, happen, terribl, car, crash]"
1,2,,,"[heard, earthquak, differ, citi, stay, safe, e..."
2,3,,,"[forest, fire, spot, pond, gees, flee, across,..."
3,9,,,"[apocalyps, light, spokan, wildfir]"
4,11,,,"[typhoon, soudelor, kill, 28, china, taiwan]"


In [45]:
test.to_csv('Archivos/test_nltk.csv', index=False)

### Agrego tres nuevos features

In [13]:
entrenamiento_df = pd.read_csv('Archivos/entrenamiento_nltk.csv', converters={'text': eval})
train = pd.read_csv('Archivos/train_5_features.csv')

In [14]:
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales
0,1,69,13,0.688525,0.424398,1
1,1,38,7,0.688525,0.424398,0
2,1,133,22,0.688525,0.424398,0
3,1,65,8,0.688525,0.424398,1
4,1,88,16,0.688525,0.424398,2


In [15]:
entrenamiento_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deed, reason, earthquak, may, allah, for...",1
1,4,,,"[forest, fire, near, la, rong, sask, canada]",1
2,5,,,"[all, resid, ask, shelter, place, notifi, offi...",1
3,6,,,"[13, 000, peopl, receiv, wildfir, evacu, order...",1
4,7,,,"[just, got, sent, photo, rubi, alaska, smoke, ...",1


In [16]:
# Agrego feature con la cantidad de palabras relevantes del texto
train['cant_palabras_relevantes'] = entrenamiento_df['text'].agg(lambda x: len(x))

In [17]:
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales,cant_palabras_relevantes
0,1,69,13,0.688525,0.424398,1,8
1,1,38,7,0.688525,0.424398,0,7
2,1,133,22,0.688525,0.424398,0,13
3,1,65,8,0.688525,0.424398,1,8
4,1,88,16,0.688525,0.424398,2,10


In [18]:
# Agreco columnas con las longitudes de la palabra mas larga y la palabra mas corta del texto
train['long_palabra_larga'] = entrenamiento_df['text'].agg(lambda x: max([len(i) for i in x]))
train['long_palabra_corta'] = entrenamiento_df['text'].agg(lambda x: min([len(i) for i in x]))

In [19]:
train.head()

Unnamed: 0,target,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales,cant_palabras_relevantes,long_palabra_larga,long_palabra_corta
0,1,69,13,0.688525,0.424398,1,8,9,2
1,1,38,7,0.688525,0.424398,0,7,6,2
2,1,133,22,0.688525,0.424398,0,13,7,2
3,1,65,8,0.688525,0.424398,1,8,10,2
4,1,88,16,0.688525,0.424398,2,10,7,3


In [20]:
train['long_palabra_corta'].value_counts()

2     3364
1     2198
3      857
0      687
4      368
5       92
6       33
7        7
8        3
10       2
14       1
9        1
Name: long_palabra_corta, dtype: int64

In [21]:
train.to_csv('Archivos/train_8_features.csv', index=False)

In [22]:
test_df = pd.read_csv('Archivos/test_nltk.csv', converters={'text': eval})
test = pd.read_csv('Archivos/test_5_features.csv')

In [23]:
# Repito las columnas para el archivo test
test['cant_palabras_relevantes'] = test_df['text'].agg(lambda x: len(x))
test['long_palabra_larga'] = test_df['text'].agg(lambda x: max([len(i) for i in x]))
test['long_palabra_corta'] = test_df['text'].agg(lambda x: min([len(i) for i in x]))

In [24]:
test.head()

Unnamed: 0,id,longitud_text,cantidad_palabras,keyword_mean,location_mean,numerales,cant_palabras_relevantes,long_palabra_larga,long_palabra_corta
0,0,34,6,0.688525,0.424398,0,5,7,3
1,2,64,9,0.688525,0.424398,1,7,9,4
2,3,96,19,0.688525,0.424398,0,11,6,1
3,9,40,4,0.688525,0.424398,2,4,9,5
4,11,45,8,0.688525,0.424398,0,6,8,2


In [25]:
test.to_csv('Archivos/test_8_features.csv', index=False)

### Creo nuevos features mediante vectores de texto ya entrenados

Separo el texto en palabras y elimino las stop words. 
No aplico streamming para quedarme con las palabras completas.

In [16]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/cecilia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cecilia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
train = pd.read_csv('Archivos/entrenamiento_df.csv')
test = pd.read_csv('Archivos/test_df.csv')

In [19]:
# Separo el texto en palabras
# Elimino las stop words y paso las palabras a minusculas
train['text'] = train['text'].apply(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
train['text'] = train['text'].apply(lambda x: [w.lower() for w in x if w not in stopwords.words('english')])

In [20]:
# Hago lo mismo con el archivo test
test['text'] = test['text'].apply(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
test['text'] = test['text'].apply(lambda x: [w.lower() for w in x if w not in stopwords.words('english')])

In [11]:
import gensim.downloader as api

In [12]:
# load pre-trained word-vectors from gensim-data
word_vectors = api.load("glove-wiki-gigaword-100") 

In [11]:
train['text'][0]

['our', 'deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']

In [271]:
word_vectors.most_similar(positive=train['text'][0])

[('we', 0.8172246217727661),
 ('not', 0.8154860734939575),
 ('what', 0.8063154816627502),
 ('do', 0.8049666285514832),
 ('...', 0.8043433427810669),
 ('know', 0.8036221861839294),
 ('come', 0.7973394989967346),
 ('if', 0.7953765988349915),
 ('why', 0.7918872237205505),
 ('believe', 0.7906121015548706)]

In [272]:
word_vectors.most_similar(negative=train['text'][0])

[('purva.patel@chron.com', 0.698187530040741),
 ('tom.fowler@chron.com', 0.6924424767494202),
 ('brett.clanton@chron.com', 0.6868560910224915),
 ('spodumene', 0.6839196085929871),
 ('atentamente', 0.6803922653198242),
 ('___________________________________________________________',
  0.6781449317932129),
 ('rungfapaisarn', 0.6756051778793335),
 ('ryryryryryry', 0.675279438495636),
 ('jenalia.moreno@chron.com', 0.6748037934303284),
 ('kraz', 0.6747796535491943)]

In [274]:
word_vectors.doesnt_match(train['text'][0])

'earthquake'

In [269]:
train['text'][1]

['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']

In [268]:
word_vectors.most_similar(positive=train['text'][1])

[('northern', 0.6509563326835632),
 ('southern', 0.6403713226318359),
 ('northeast', 0.6403039693832397),
 ('park', 0.6400458812713623),
 ('mountain', 0.6395445466041565),
 ('southwest', 0.6368558406829834),
 ('river', 0.6357375383377075),
 ('located', 0.6302077174186707),
 ('northwest', 0.6292986273765564),
 ('area', 0.6256020069122314)]

In [23]:
# Elimino las palabras del texto que no estan en el vocabulario
def vocabulario(palabras):
    lista = []
    for w in palabras:
        if w in word_vectors.vocab:
            lista.append(w) 
    if (len(lista) == 0):
        return ['nothing']
    return lista

In [13]:
similares = train['text'].agg(lambda x: word_vectors.most_similar(positive=vocabulario(x), topn=1))

In [14]:
similares

0             [(we, 0.8172246217727661)]
1       [(northern, 0.6509563326835632)]
2           [(they, 0.8379936218261719)]
3             [(10, 0.7982413172721863)]
4            [(out, 0.7865819334983826)]
                      ...               
7608     [(building, 0.754969596862793)]
7609        [(this, 0.8649874329566956)]
7610          [(03, 0.6322605013847351)]
7611      [(another, 0.787148118019104)]
7612        [(last, 0.7914666533470154)]
Name: text, Length: 7613, dtype: object

In [15]:
palabras_similares = [x[0][0] for x in similares]
semejanzas = [x[0][1] for x in similares]

In [16]:
palabras_similares[:5]

['we', 'northern', 'they', '10', 'out']

In [17]:
semejanzas[0:5]

[0.8172246217727661,
 0.6509563326835632,
 0.8379936218261719,
 0.7982413172721863,
 0.7865819334983826]

In [136]:
sim_train = pd.DataFrame({'target':train['target'], 'semejanza':semejanzas})
sim_train.to_csv('Archivos/train_semejanzas.csv', index=False)

In [137]:
sim_train.head()

Unnamed: 0,target,semejanza
0,1,0.817225
1,1,0.650956
2,1,0.837994
3,1,0.798241
4,1,0.786582


In [138]:
train_palabras = pd.DataFrame({'target':train['target'], 'palabra_similar':palabras_similares})
train_palabras['palabra_similar'].value_counts()

so           414
one          405
you          378
just         363
this         295
            ... 
landfall       1
fabric         1
group          1
lane           1
agreement      1
Name: palabra_similar, Length: 937, dtype: int64

In [139]:
# Agrego feature con el mean encoding de las palabras similares
mean_palabra_similar = train_palabras.groupby('palabra_similar')['target'].agg(['mean','count'])
mean_palabra_similar.count()

mean     937
count    937
dtype: int64

In [140]:
# Dejo solo los promedios de las palabras con mas de 5 ocurrencias
mean_palabra_similar = mean_palabra_similar[mean_palabra_similar['count'] > 5].drop(['count'],axis=1)
mean_palabra_similar.count()

mean    173
dtype: int64

Son pocos registros como para crear otro feature

In [102]:
# Encuentro las palabras que menos se acercan a los textos
diferentes = train['text'].agg(lambda x: word_vectors.most_similar(negative=vocabulario(x), topn=1))

In [105]:
diferentes

0            [(purva.patel@chron.com, 0.698187530040741)]
1                          [(yassen, 0.5855438113212585)]
2                            [(zety, 0.7851395606994629)]
3                   [(rungfapaisarn, 0.7050082087516785)]
4       [(____________________________________________...
                              ...                        
7608                          [(zety, 0.750871479511261)]
7609                         [(zety, 0.8317862749099731)]
7610                         [(rw95, 0.6121312379837036)]
7611    [(____________________________________________...
7612                         [(zety, 0.8025525212287903)]
Name: text, Length: 7613, dtype: object

In [141]:
palabras_diferentes = [x[0][0] for x in diferentes]
diferencias = [x[0][1] for x in diferentes]

In [142]:
palabras_diferentes[:5]

['purva.patel@chron.com',
 'yassen',
 'zety',
 'rungfapaisarn',
 '___________________________________________________________']

In [143]:
diferencias[0:5]

[0.698187530040741,
 0.5855438113212585,
 0.7851395606994629,
 0.7050082087516785,
 0.7276370525360107]

In [144]:
sim_train['diferencia'] = diferencias

In [145]:
sim_train.head()

Unnamed: 0,target,semejanza,diferencia
0,1,0.817225,0.698188
1,1,0.650956,0.585544
2,1,0.837994,0.78514
3,1,0.798241,0.705008
4,1,0.786582,0.727637


In [146]:
sim_train.to_csv('Archivos/train_semejanzas.csv', index=False)

In [147]:
train_palabras['palabra_diferente'] = palabras_diferentes
train_palabras['palabra_diferente'].value_counts()

___________________________________________________________    1893
zety                                                           1812
brett.clanton@chron.com                                         574
ryryryryryry                                                    285
tom.fowler@chron.com                                            217
                                                               ... 
ojcl                                                              1
ukita                                                             1
zongmi                                                            1
katzenbach                                                        1
wattanachai                                                       1
Name: palabra_diferente, Length: 1158, dtype: int64

In [148]:
# Agrego feature con el mean encoding de las palabras diferentes
mean_palabra_dif = train_palabras.groupby('palabra_diferente')['target'].agg(['mean','count'])
mean_palabra_dif.count()

mean     1158
count    1158
dtype: int64

In [149]:
mean_palabra_dif['count'].mean()

6.57426597582038

In [150]:
# Dejo solo los promedios de las palabras con mas de 5 ocurrencias
mean_palabra_dif = mean_palabra_dif[mean_palabra_dif['count'] > 5].drop(['count'],axis=1)
mean_palabra_dif.count()

mean    75
dtype: int64

Son pocos registros como para crear otro feature

In [119]:
# Encuentro la palabra que no coincide con el resto del texto
infiltrados = train['text'].agg(lambda x: word_vectors.doesnt_match(vocabulario(x)))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [120]:
infiltrados

0       earthquake
1             sask
2         notified
3        wildfires
4            pours
           ...    
7608          http
7609     troubling
7610           5km
7611      collided
7612          http
Name: text, Length: 7613, dtype: object

In [151]:
train_palabras['palabra_infiltrada'] = infiltrados
train_palabras['palabra_infiltrada'].value_counts()

http         1619
https         306
û              89
lol            47
rt             46
             ... 
sabotage        1
ben             1
charminar       1
hostages        1
pharma          1
Name: palabra_infiltrada, Length: 3171, dtype: int64

In [152]:
train_palabras.head()

Unnamed: 0,target,palabra_similar,palabra_diferente,palabra_infiltrada
0,1,we,purva.patel@chron.com,earthquake
1,1,northern,yassen,sask
2,1,they,zety,notified
3,1,10,rungfapaisarn,wildfires
4,1,out,______________________________________________...,pours


In [153]:
# Agrego feature con el mean encoding de las palabras infiltradas
mean_infiltradas = train_palabras.groupby('palabra_infiltrada')['target'].agg(['mean','count'])
mean_infiltradas.count()

mean     3171
count    3171
dtype: int64

In [154]:
mean_infiltradas['count'].mean()

2.400819930621255

In [155]:
# Dejo solo los promedios de las palabras con mas de 5 ocurrencias
mean_infiltradas = mean_infiltradas[mean_infiltradas['count'] > 5].drop(['count'],axis=1)
mean_infiltradas.count()

mean    141
dtype: int64

Son pocos registros como para crear otro feature

In [156]:
sim_train.head()

Unnamed: 0,target,semejanza,diferencia
0,1,0.817225,0.698188
1,1,0.650956,0.585544
2,1,0.837994,0.78514
3,1,0.798241,0.705008
4,1,0.786582,0.727637


In [157]:
sim_train.to_csv('Archivos/train_semejanzas.csv', index=False)

In [24]:
# Calculo la similaridad entre el texto y la palabra infiltrada
sim_train['simil_intruso'] = train['text'].agg(lambda x: word_vectors.n_similarity(vocabulario(x), word_vectors.doesnt_match(vocabulario(x))))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [25]:
sim_train.head()

Unnamed: 0,target,semejanza,diferencia,simil_intruso
0,1,0.817225,0.698188,0.515924
1,1,0.650956,0.585544,0.430637
2,1,0.837994,0.78514,0.503261
3,1,0.798241,0.705008,0.432212
4,1,0.786582,0.727637,0.378606


In [26]:
sim_train.to_csv('Archivos/train_semejanzas.csv', index=False)

#### Repito el mecanismo para el archivo de test

In [158]:
similares_test = test['text'].agg(lambda x: word_vectors.most_similar(positive=vocabulario(x), topn=1))

In [159]:
similares_test

0          [(accident, 0.7852973937988281)]
1               [(come, 0.845439612865448)]
2              [(away, 0.8163744211196899)]
3          [(wildfire, 0.6221873164176941)]
4          [(mainland, 0.6732683777809143)]
                       ...                 
3258         [(center, 0.6763065457344055)]
3259           [(just, 0.8925397396087646)]
3260            [(new, 0.7113810181617737)]
3261    [(environment, 0.7011711597442627)]
3262       [(approved, 0.7105252146720886)]
Name: text, Length: 3263, dtype: object

In [160]:
palabras_similares = [x[0][0] for x in similares_test]
semejanzas = [x[0][1] for x in similares_test]

In [161]:
palabras_similares[:5]

['accident', 'come', 'away', 'wildfire', 'mainland']

In [162]:
semejanzas[0:5]

[0.7852973937988281,
 0.845439612865448,
 0.8163744211196899,
 0.6221873164176941,
 0.6732683777809143]

In [163]:
sim_test = pd.DataFrame({'id':test['id'], 'semejanza':semejanzas})
sim_test.to_csv('Archivos/test_semejanzas.csv', index=False)

In [164]:
sim_test.head()

Unnamed: 0,id,semejanza
0,0,0.785297
1,2,0.84544
2,3,0.816374
3,9,0.622187
4,11,0.673268


In [165]:
test_palabras = pd.DataFrame({'id':test['id'], 'palabra_similar':palabras_similares})
test_palabras['palabra_similar'].value_counts()

so           180
just         169
one          154
you          152
this         137
            ... 
displaced      1
2008           1
refugees       1
2007           1
insurance      1
Name: palabra_similar, Length: 593, dtype: int64

In [166]:
sim_test.head()

Unnamed: 0,id,semejanza
0,0,0.785297
1,2,0.84544
2,3,0.816374
3,9,0.622187
4,11,0.673268


In [167]:
# Encuentro las palabras que menos se acercan a los textos
diferentes_test = test['text'].agg(lambda x: word_vectors.most_similar(negative=vocabulario(x), topn=1))

In [168]:
diferentes_test

0       [(____________________________________________...
1                            [(zety, 0.7198097109794617)]
2                            [(zety, 0.7555083632469177)]
3                           [(30.11, 0.4841735363006592)]
4                          [(hoeyer, 0.5357732176780701)]
                              ...                        
3258                       [(3.6730, 0.6118791103363037)]
3259    [(____________________________________________...
3260    [(____________________________________________...
3261                 [(kallicharran, 0.6488111615180969)]
3262                    [(ambareesh, 0.6341420412063599)]
Name: text, Length: 3263, dtype: object

In [169]:
palabras_diferentes_test = [x[0][0] for x in diferentes_test]
diferencias_test = [x[0][1] for x in diferentes_test]

In [170]:
palabras_diferentes_test[:5]

['___________________________________________________________',
 'zety',
 'zety',
 '30.11',
 'hoeyer']

In [171]:
diferencias_test[0:5]

[0.6520140767097473,
 0.7198097109794617,
 0.7555083632469177,
 0.4841735363006592,
 0.5357732176780701]

In [172]:
sim_test['diferencia'] = diferencias_test

In [173]:
sim_test.head()

Unnamed: 0,id,semejanza,diferencia
0,0,0.785297,0.652014
1,2,0.84544,0.71981
2,3,0.816374,0.755508
3,9,0.622187,0.484174
4,11,0.673268,0.535773


In [174]:
sim_test.to_csv('Archivos/test_semejanzas.csv', index=False)

In [175]:
test_palabras['palabra_diferente'] = palabras_diferentes_test
test_palabras['palabra_diferente'].value_counts()

zety                                                           821
___________________________________________________________    766
brett.clanton@chron.com                                        235
tom.fowler@chron.com                                           125
ryryryryryry                                                   118
                                                              ... 
metemma                                                          1
huckins                                                          1
liepa                                                            1
gédéon                                                           1
useni                                                            1
Name: palabra_diferente, Length: 601, dtype: int64

In [176]:
sim_test.head()

Unnamed: 0,id,semejanza,diferencia
0,0,0.785297,0.652014
1,2,0.84544,0.71981
2,3,0.816374,0.755508
3,9,0.622187,0.484174
4,11,0.673268,0.535773


In [177]:
# Encuentro la palabra que no coincide con el resto del texto
infiltrados_test = test['text'].agg(lambda x: word_vectors.doesnt_match(vocabulario(x)))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [178]:
infiltrados_test

0         terrible
1       earthquake
2            geese
3          spokane
4         soudelor
           ...    
3258     fasteners
3259            ri
3260    derailment
3261           meg
3262     activated
Name: text, Length: 3263, dtype: object

In [179]:
test_palabras['palabra_infiltrada'] = infiltrados_test
test_palabras['palabra_infiltrada'].value_counts()

http       659
https      155
û           33
lol         22
rt          20
          ... 
attila       1
dominos      1
sens         1
harry        1
fire         1
Name: palabra_infiltrada, Length: 1584, dtype: int64

In [29]:
sim_test.to_csv('Archivos/test_semejanzas.csv', index=False)

In [27]:
sim_test = pd.read_csv('Archivos/test_semejanzas.csv')

In [28]:
# Calculo la similaridad entre el texto y la palabra infiltrada
sim_test['simil_intruso'] = test['text'].agg(lambda x: word_vectors.n_similarity(vocabulario(x), word_vectors.doesnt_match(vocabulario(x))))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [29]:
sim_test.head()

Unnamed: 0,id,semejanza,diferencia,simil_intruso
0,0,0.785297,0.652014,0.418288
1,2,0.84544,0.71981,0.513763
2,3,0.816374,0.755508,0.34762
3,9,0.622187,0.484174,0.071788
4,11,0.673268,0.535773,0.259028


In [30]:
sim_test.to_csv('Archivos/test_semejanzas.csv', index=False)