In [1]:
#Utilizacion de Word2Vec junto con palabras_claves

In [2]:
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [4]:
#Al igual que en otros notebooks, preprocesamiento previo
# Quitamos las urls
df['text'] = df['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,id,text
0,1,deeds reason earthquake may allah forgive us
1,4,forest fire near la ronge sask canada
2,5,residents asked shelter place notified officers evacuation shelter place orders expected
3,6,13000 people receive wildfires evacuation orders california
4,7,got sent photo ruby alaska smoke wildfires pours school


In [6]:
#Para que esto funcione, se requiere que todas las palabras de todos los textos de los tweets esten en una lista y todo esto
#En una lista. Es decir, una lista de listas.
palabras = []
def extraer_palabras(string):
    splitted = string.split(' ')
    lista = []
    for palabra in splitted:
        lista.append(palabra)
    palabras.append(lista)
    
df['text'].apply(extraer_palabras)

#Se genera el modelo. Min_count es la cantidad minima de veces que debe figurar una palabra para ser considerada
word2vec = Word2Vec(palabras,min_count=5,size=300,workers=4)
vocabulary = word2vec.wv.vocab

In [7]:
#La siguiente funcion se encarga de calcular un promedio de similitudes con respecto a la palabra clave. 
def calcular_promedio(text,string):
    split = text.split(' ')
    promedio = 0
    cantidad_de_palabras = 0
    for palabra in split:
        cantidad_de_palabras+=1
        try:
            promedio += word2vec.wv.similarity(string,palabra)
        except KeyError:
            continue
    return promedio/cantidad_de_palabras
#Estas son las principales categorias de accidentes posibles, asi que buscaremos la similitud entre palabras y oraciones
df['similitud_debris'] = df['text'].apply(lambda x: calcular_promedio(x,"debris"))
df['similitud_derailment'] = df['text'].apply(lambda x: calcular_promedio(x,"derailment"))
df['similitud_wreckage'] = df['text'].apply(lambda x: calcular_promedio(x,"wreckage"))
df['similitud_outbreak'] = df['text'].apply(lambda x: calcular_promedio(x,"outbreak"))
df['similitud_typhoon'] = df['text'].apply(lambda x: calcular_promedio(x,"typhoon"))
df['similitud_bombing'] = df['text'].apply(lambda x: calcular_promedio(x,"bombing"))

df

Unnamed: 0,id,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing
0,1,deeds reason earthquake may allah forgive us,0.714005,0.714166,0.713776,0.713186,0.714178,0.713808
1,4,forest fire near la ronge sask canada,0.714044,0.714173,0.713822,0.713173,0.714182,0.713747
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,0.908708,0.908918,0.908411,0.907641,0.908930,0.908458
3,6,13000 people receive wildfires evacuation orders california,0.874428,0.874638,0.874145,0.873399,0.874646,0.874163
4,7,got sent photo ruby alaska smoke wildfires pours school,0.799687,0.799871,0.799429,0.798729,0.799879,0.799468
5,8,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires,0.799667,0.799846,0.799407,0.798721,0.799854,0.799404
6,10,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,0.916213,0.916416,0.915922,0.915131,0.916432,0.915939
7,13,im top hill see fire woods,0.833023,0.833225,0.832753,0.832055,0.833232,0.832754
8,14,theres emergency evacuation happening building across street,0.999670,0.999902,0.999346,0.998498,0.999911,0.999362
9,15,im afraid tornado coming area,0.999545,0.999782,0.999211,0.998347,0.999787,0.999231


In [8]:
#Cada una de las anteriores categorias, posee una serie de palabras mas cercanas. Lo que vamos a evaluar es si alguno de los
#Tweets posee una de estas palabras.
def posee_palabra_mas_cercana(palabra,texto):
    palabras_mas_cercanas = word2vec.wv.most_similar(palabra)
    cantidad = 5
    palabras_cercanas = []
    for palabra in palabras_mas_cercanas:
        palabras_cercanas.append(palabra[0])
    splitted = texto.split(' ')
    for palabra in splitted:
        if palabra in palabras_cercanas:
            return True
    return False
df['cercania_bombing'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("bombing",x))
df['cercania_debris'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("debris",x))
df['cercania_derailment'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("derailment",x))
df['cercania_wreckage'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("wreckage",x))
df['cercania_outbreak'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("outbreak",x))

In [9]:

df.drop(columns=['text']).to_csv('../features/word2vec.csv', index=False)


In [10]:
#Crearemos una columna que posea el vector que representa a cada oracion. Para ello, nos quedaremos con el vector promedio
#entre el maximo y minimo de los vectores.
def obtener_vector(string):
    max = []
    min = []
    vector = []
    largo = 300
    for indice in range(largo):
        max.append(0)
        min.append(1)
    splitted = string.split(' ')
    
    for palabra in splitted:
        try:
            vector = word2vec.wv[palabra]
            for indice in range(largo):
                if max[indice]<vector[indice]:
                    max[indice] = vector[indice]
                if min[indice]>vector[indice]:
                    min[indice] = vector[indice]
        except KeyError:
            continue
    
    
    for indice in range(largo):
        max[indice] = (max[indice]+min[indice])/2
    return max
    
df['vector'] = df['text'].apply(obtener_vector)

In [11]:
#vamos a crear un nuevo dataframe que tenga solamente el vector
columnas = []
for indice in range(300):
    columnas.append(str(indice))
df[columnas] = pd.DataFrame(df['vector'].to_list(), columns=columnas)


In [12]:
columnas.append('id')
df_2 = df[columnas]
df_2.set_index('id',inplace = True)
df_2.to_csv('../features/word2vec_vec.csv', index=False)

Voy a generar los mismos features pero sin el filtrado inicial para ver si cuestiones como los links o los hashtags modifican los resultados

In [13]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [18]:
palabras = []
def extraer_palabras(string):
    splitted = string.split(' ')
    lista = []
    for palabra in splitted:
        lista.append(palabra)
    palabras.append(lista)
    
    
df['text'].apply(extraer_palabras)
palabras

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '#earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  "place'",
  'are',
  'being',
  'notified',
  'by',
  'officers.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#wildfires',
  'evacuation',
  'orders',
  'in',
  'California',
  ''],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '#Alaska',
  'as',
  'smoke',
  'from',
  '#wildfires',
  'pours',
  'into',
  'a',
  'school',
  ''],
 ['#RockyFire',
  'Update',
  '=>',
  'California',
  'Hwy.',
  '20',
  'closed',
  'in',
  'both',
  'directions',
  'due',
  'to',
  'Lake',
  'County',
  'fire',
  '-',
  '#CAfire',
  '#wildfires'],
 ['#flood',
  '#disaster',
  'Heavy',
  'rain',
  'causes',
  'flas

In [15]:
word2vec = Word2Vec(palabras, min_count=2)
vocabulary = word2vec.wv.vocab

In [16]:
df['similitud_debris'] = df['text'].apply(lambda x: calcular_promedio(x,"debris"))
df['similitud_derailment'] = df['text'].apply(lambda x: calcular_promedio(x,"derailment"))
df['similitud_wreckage'] = df['text'].apply(lambda x: calcular_promedio(x,"wreckage"))
df['similitud_outbreak'] = df['text'].apply(lambda x: calcular_promedio(x,"outbreak"))
df['similitud_typhoon'] = df['text'].apply(lambda x: calcular_promedio(x,"typhoon"))
df['similitud_bombing'] = df['text'].apply(lambda x: calcular_promedio(x,"bombing"))

df

Unnamed: 0,id,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,0.844238,0.844428,0.843913,0.843137,0.842347,0.844461
1,4,Forest fire near La Ronge Sask. Canada,0.713622,0.713535,0.712992,0.712752,0.711887,0.713586
2,5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,0.817241,0.817398,0.816880,0.816102,0.815351,0.817357
3,6,"13,000 people receive #wildfires evacuation orders in California",0.879307,0.879182,0.878330,0.878049,0.877107,0.879323
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,0.932132,0.932201,0.931823,0.930484,0.929909,0.932135
5,8,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,0.831420,0.831453,0.830810,0.830314,0.829377,0.831470
6,10,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",0.927314,0.927333,0.926615,0.926197,0.925215,0.927365
7,13,I'm on top of the hill and I can see a fire in the woods...,0.930994,0.931312,0.930939,0.929648,0.928638,0.931223
8,14,There's an emergency evacuation happening now in the building across the street,0.998872,0.999076,0.998420,0.997537,0.996428,0.999060
9,15,I'm afraid that the tornado is coming to our area...,0.989260,0.989676,0.989293,0.987620,0.987406,0.989503


In [17]:
df.set_index('id',inplace =True)
df.drop(columns=['text']).to_csv('../features/word2vec_sin_filtrar.csv', index=False)