In [1]:
#Utilizacion de Word2Vec 

In [2]:
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [4]:
#Al igual que en otros notebooks, preprocesamiento previo
# Quitamos las urls
df['text'] = df['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,id,text
0,1,deeds reason earthquake may allah forgive us
1,4,forest fire near la ronge sask canada
2,5,residents asked shelter place notified officers evacuation shelter place orders expected
3,6,13000 people receive wildfires evacuation orders california
4,7,got sent photo ruby alaska smoke wildfires pours school


In [9]:
#Para que esto funcione, se requiere que todas las palabras de todos los textos de los tweets esten en una lista y todo esto
#En una lista. Es decir, una lista de listas.
palabras = []
def extraer_palabras(string):
    splitted = string.split(' ')
    lista = []
    for palabra in splitted:
        lista.append(palabra)
    palabras.append(lista)
    
df['text'].apply(extraer_palabras)

#Se genera el modelo. Min_count es la cantidad minima de veces que debe figurar una palabra para ser considerada
word2vec = Word2Vec(palabras, min_count=2)
vocabulary = word2vec.wv.vocab

In [8]:
#La siguiente funcion se encarga de calcular un promedio de similitudes con respecto a la palabra clave. 
def calcular_promedio(text,string):
    split = text.split(' ')
    promedio = 0
    cantidad_de_palabras = 0
    for palabra in split:
        cantidad_de_palabras+=1
        try:
            promedio += word2vec.wv.similarity(string,palabra)
        except KeyError:
            continue
    return promedio/cantidad_de_palabras    
    
df['similitud_debris'] = df['text'].apply(lambda x: calcular_promedio(x,"debris"))
df['similitud_derailment'] = df['text'].apply(lambda x: calcular_promedio(x,"derailment"))
df['similitud_wreckage'] = df['text'].apply(lambda x: calcular_promedio(x,"wreckage"))
df['similitud_outbreak'] = df['text'].apply(lambda x: calcular_promedio(x,"outbreak"))
df['similitud_typhoon'] = df['text'].apply(lambda x: calcular_promedio(x,"typhoon"))
df['similitud_bombing'] = df['text'].apply(lambda x: calcular_promedio(x,"bombing"))

df

Unnamed: 0,id,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing
0,1,deeds reason earthquake may allah forgive us,0.996024,0.996553,0.995077,0.993587,0.996509,0.996207
1,4,forest fire near la ronge sask canada,0.713856,0.714046,0.713263,0.712062,0.714041,0.713694
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,0.908261,0.908685,0.907431,0.905895,0.908679,0.908250
3,6,13000 people receive wildfires evacuation orders california,0.996599,0.996907,0.995615,0.994480,0.996823,0.996434
4,7,got sent photo ruby alaska smoke wildfires pours school,0.799384,0.799692,0.798665,0.797363,0.799692,0.799360
5,8,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires,0.865659,0.865933,0.864943,0.863562,0.865913,0.865511
6,10,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,0.915738,0.916136,0.914917,0.913453,0.916122,0.915702
7,13,im top hill see fire woods,0.995398,0.995900,0.994533,0.992825,0.995885,0.995404
8,14,theres emergency evacuation happening building across street,0.999394,0.999787,0.998523,0.996834,0.999792,0.999294
9,15,im afraid tornado coming area,0.999007,0.999431,0.998123,0.996602,0.999441,0.998934


In [71]:
df.set_index('id',inplace =True)
df.drop(columns=['text']).to_csv('../features/word2vec.csv', index=False)