In [1]:
#Utilizacion de Word2Vec junto con palabras_claves

In [25]:
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [27]:
#Al igual que en otros notebooks, preprocesamiento previo
# Quitamos las urls
df['text'] = df['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(clean_text)

In [28]:
df.head()

Unnamed: 0,id,text
0,1,deeds reason earthquake may allah forgive us
1,4,forest fire near la ronge sask canada
2,5,residents asked shelter place notified officers evacuation shelter place orders expected
3,6,13000 people receive wildfires evacuation orders california
4,7,got sent photo ruby alaska smoke wildfires pours school


In [29]:
#Para que esto funcione, se requiere que todas las palabras de todos los textos de los tweets esten en una lista y todo esto
#En una lista. Es decir, una lista de listas.
palabras = []
def extraer_palabras(string):
    splitted = string.split(' ')
    lista = []
    for palabra in splitted:
        lista.append(palabra)
    palabras.append(lista)
    
df['text'].apply(extraer_palabras)

#Se genera el modelo. Min_count es la cantidad minima de veces que debe figurar una palabra para ser considerada
word2vec = Word2Vec(palabras,min_count=5,size=300,workers=4)
vocabulary = word2vec.wv.vocab

In [30]:
#La siguiente funcion se encarga de calcular un promedio de similitudes con respecto a la palabra clave. 
def calcular_promedio(text,string):
    split = text.split(' ')
    promedio = 0
    cantidad_de_palabras = 0
    for palabra in split:
        cantidad_de_palabras+=1
        try:
            promedio += word2vec.wv.similarity(string,palabra)
        except KeyError:
            continue
    return promedio/cantidad_de_palabras    
df['similitud_debris'] = df['text'].apply(lambda x: calcular_promedio(x,"debris"))
df['similitud_derailment'] = df['text'].apply(lambda x: calcular_promedio(x,"derailment"))
df['similitud_wreckage'] = df['text'].apply(lambda x: calcular_promedio(x,"wreckage"))
df['similitud_outbreak'] = df['text'].apply(lambda x: calcular_promedio(x,"outbreak"))
df['similitud_typhoon'] = df['text'].apply(lambda x: calcular_promedio(x,"typhoon"))
df['similitud_bombing'] = df['text'].apply(lambda x: calcular_promedio(x,"bombing"))
df['similitud_con_I'] = df['text'].apply(lambda x: calcular_promedio(x,"i"))

df

Unnamed: 0,id,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing,similitud_con_I
0,1,deeds reason earthquake may allah forgive us,0.713977,0.714166,0.713753,0.713318,0.714170,0.713830,0.714168
1,4,forest fire near la ronge sask canada,0.714029,0.714176,0.713814,0.713346,0.714171,0.713771,0.714171
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,0.908716,0.908941,0.908436,0.907876,0.908935,0.908490,0.908940
3,6,13000 people receive wildfires evacuation orders california,0.874406,0.874651,0.874107,0.873649,0.874647,0.874165,0.874645
4,7,got sent photo ruby alaska smoke wildfires pours school,0.799672,0.799879,0.799421,0.798908,0.799882,0.799467,0.799878
5,8,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires,0.799654,0.799858,0.799402,0.798937,0.799858,0.799404,0.799856
6,10,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,0.916238,0.916474,0.915951,0.915380,0.916474,0.915997,0.916463
7,13,im top hill see fire woods,0.833009,0.833239,0.832753,0.832208,0.833239,0.832778,0.833249
8,14,theres emergency evacuation happening building across street,0.999642,0.999900,0.999331,0.998692,0.999901,0.999374,0.999904
9,15,im afraid tornado coming area,0.999568,0.999841,0.999246,0.998615,0.999840,0.999305,0.999846


In [31]:
#Vamos a crear una lista de features basados en si el texto en cuestion, posee alguna de las palabras mas similares a la 
#palabra en cuestion
def posee_palabra_mas_cercana(palabra,texto):
    palabras_mas_cercanas = word2vec.wv.most_similar(palabra)
    cantidad = 5
    palabras_cercanas = []
    for palabra in palabras_mas_cercanas:
        palabras_cercanas.append(palabra[0])
    splitted = texto.split(' ')
    for palabra in splitted:
        if palabra in palabras_cercanas:
            return True
    return False
df['cercania_bombing'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("bombing",x))
df['cercania_debris'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("debris",x))
df['cercania_derailment'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("derailment",x))
df['cercania_wreckage'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("wreckage",x))
df['cercania_outbreak'] = df['text'].apply(lambda x:posee_palabra_mas_cercana("outbreak",x))

In [32]:
#df.set_index('id',inplace =True)
df.drop(columns=['text']).to_csv('../features/word2vec.csv', index=False)


Utilicemos el vector que nos da word2vec

In [33]:
def obtener_vector_minimo(string):
    max = []
    min = []
    vector = []
    largo = 300
    for indice in range(largo):
        max.append(0)
        min.append(1)
    splitted = string.split(' ')
    
    for palabra in splitted:
        try:
            vector = word2vec.wv[palabra]
            for indice in range(largo):
                if max[indice]<vector[indice]:
                    max[indice] = vector[indice]
                if min[indice]>vector[indice]:
                    min[indice] = vector[indice]
        except KeyError:
            continue
    
    
    for indice in range(largo):
        max[indice] = (max[indice]+min[indice])/2
    return max
    
df['vector'] = df['text'].apply(obtener_vector_minimo)

In [34]:
#vamos a crear un nuevo dataframe que tenga solamente el vector
columnas = []
for indice in range(300):
    columnas.append(str(indice))
df[columnas] = pd.DataFrame(df['vector'].to_list(), columns=columnas)


Unnamed: 0,id,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing,similitud_con_I,cercania_bombing,...,290,291,292,293,294,295,296,297,298,299
0,1,deeds reason earthquake may allah forgive us,0.713977,0.714166,0.713753,0.713318,0.714170,0.713830,0.714168,False,...,0.040513,-0.125959,-0.126642,0.118410,-0.081137,0.215906,0.227337,-0.148360,0.224014,0.101420
1,4,forest fire near la ronge sask canada,0.714029,0.714176,0.713814,0.713346,0.714171,0.713771,0.714171,False,...,0.043816,-0.131888,-0.131770,0.127149,-0.084561,0.229661,0.242890,-0.153219,0.235863,0.109503
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,0.908716,0.908941,0.908436,0.907876,0.908935,0.908490,0.908940,False,...,0.032297,-0.096270,-0.094962,0.098802,-0.060494,0.181570,0.189768,-0.112300,0.185829,0.084436
3,6,13000 people receive wildfires evacuation orders california,0.874406,0.874651,0.874107,0.873649,0.874647,0.874165,0.874645,False,...,0.036226,-0.123700,-0.122763,0.109165,-0.078441,0.199222,0.209509,-0.143529,0.205923,0.094308
4,7,got sent photo ruby alaska smoke wildfires pours school,0.799672,0.799879,0.799421,0.798908,0.799882,0.799467,0.799878,False,...,0.034778,-0.112177,-0.111213,0.108170,-0.069639,0.200038,0.210107,-0.133071,0.205323,0.092914
5,8,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires,0.799654,0.799858,0.799402,0.798937,0.799858,0.799404,0.799856,False,...,0.042872,-0.131888,-0.131770,0.125749,-0.084561,0.228310,0.240722,-0.153219,0.233802,0.107875
6,10,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,0.916238,0.916474,0.915951,0.915380,0.916474,0.915997,0.916463,False,...,0.034881,-0.134677,-0.135531,0.140075,-0.084350,0.236067,0.242613,-0.154561,0.241616,0.109106
7,13,im top hill see fire woods,0.833009,0.833239,0.832753,0.832208,0.833239,0.832778,0.833249,False,...,0.044661,-0.131888,-0.131770,0.128849,-0.084561,0.234542,0.247456,-0.153219,0.240994,0.111180
8,14,theres emergency evacuation happening building across street,0.999642,0.999900,0.999331,0.998692,0.999901,0.999374,0.999904,False,...,0.047865,-0.154431,-0.152669,0.145025,-0.096448,0.270486,0.283243,-0.178415,0.276549,0.126310
9,15,im afraid tornado coming area,0.999568,0.999841,0.999246,0.998615,0.999840,0.999305,0.999846,False,...,0.037067,-0.120532,-0.119618,0.112189,-0.076889,0.203990,0.213187,-0.139327,0.208888,0.096649


In [38]:
columnas.append('id')
df_2 = df[columnas]
df_2.set_index('id',inplace = True)
df_2.to_csv('../features/word2vec_vec.csv', index=False)

Voy a generar los mismos features pero sin el filtrado inicial para ver si cuestiones como los links o los hashtags modifican los resultados

In [10]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [12]:
palabras = []
def extraer_palabras(string):
    splitted = string.split(' ')
    lista = []
    for palabra in splitted:
        lista.append(palabra)
    palabras.append(lista)
    
df['text'].apply(extraer_palabras)

[['Our',
  'Deeds',
  'are',
  'the',
  'Reason',
  'of',
  'this',
  '#earthquake',
  'May',
  'ALLAH',
  'Forgive',
  'us',
  'all'],
 ['Forest', 'fire', 'near', 'La', 'Ronge', 'Sask.', 'Canada'],
 ['All',
  'residents',
  'asked',
  'to',
  "'shelter",
  'in',
  "place'",
  'are',
  'being',
  'notified',
  'by',
  'officers.',
  'No',
  'other',
  'evacuation',
  'or',
  'shelter',
  'in',
  'place',
  'orders',
  'are',
  'expected'],
 ['13,000',
  'people',
  'receive',
  '#wildfires',
  'evacuation',
  'orders',
  'in',
  'California',
  ''],
 ['Just',
  'got',
  'sent',
  'this',
  'photo',
  'from',
  'Ruby',
  '#Alaska',
  'as',
  'smoke',
  'from',
  '#wildfires',
  'pours',
  'into',
  'a',
  'school',
  ''],
 ['#RockyFire',
  'Update',
  '=>',
  'California',
  'Hwy.',
  '20',
  'closed',
  'in',
  'both',
  'directions',
  'due',
  'to',
  'Lake',
  'County',
  'fire',
  '-',
  '#CAfire',
  '#wildfires'],
 ['#flood',
  '#disaster',
  'Heavy',
  'rain',
  'causes',
  'flas

In [16]:
word2vec = Word2Vec(palabras, min_count=2)
vocabulary = word2vec.wv.vocab

In [17]:
df['similitud_debris'] = df['text'].apply(lambda x: calcular_promedio(x,"debris"))
df['similitud_derailment'] = df['text'].apply(lambda x: calcular_promedio(x,"derailment"))
df['similitud_wreckage'] = df['text'].apply(lambda x: calcular_promedio(x,"wreckage"))
df['similitud_outbreak'] = df['text'].apply(lambda x: calcular_promedio(x,"outbreak"))
df['similitud_typhoon'] = df['text'].apply(lambda x: calcular_promedio(x,"typhoon"))
df['similitud_bombing'] = df['text'].apply(lambda x: calcular_promedio(x,"bombing"))

df

Unnamed: 0_level_0,text,similitud_debris,similitud_derailment,similitud_wreckage,similitud_outbreak,similitud_typhoon,similitud_bombing
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,0.767058,0.767511,0.766904,0.765804,0.765712,0.767206
4,Forest fire near La Ronge Sask. Canada,0.713589,0.713516,0.712684,0.712719,0.712315,0.713464
5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,0.816312,0.816661,0.815865,0.815091,0.814910,0.816430
6,"13,000 people receive #wildfires evacuation orders in California",0.758228,0.757769,0.756669,0.757675,0.757376,0.758177
7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,0.811814,0.811952,0.811147,0.810491,0.810505,0.811808
8,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,0.830705,0.830707,0.829812,0.829880,0.829240,0.830672
10,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",0.927068,0.927126,0.926140,0.926117,0.925466,0.927137
13,I'm on top of the hill and I can see a fire in the woods...,0.928274,0.929187,0.928685,0.926272,0.926597,0.928545
14,There's an emergency evacuation happening now in the building across the street,0.997646,0.998076,0.997354,0.996059,0.995842,0.997792
15,I'm afraid that the tornado is coming to our area...,0.895595,0.896452,0.895861,0.893562,0.893914,0.895779


In [39]:
df.set_index('id',inplace =True)
df.drop(columns=['text']).to_csv('../features/word2vec_sin_filtrar.csv', index=False)