In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling for the rest of the notebook.
sns.set()

In [0]:
# Colab only: mount Google Drive into the runtime filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Data

In [0]:
# Load the training CSV (the path is relative to the kernel's working directory)
# and display its (rows, columns) shape.
df_disaster = pd.read_csv('train.csv')
df_disaster.shape

(7613, 5)

In [0]:
# Download the medium English spaCy model that spacy.load("en_core_web_md") needs later in this notebook.
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for validation. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    df_disaster['text'], df_disaster['target'], test_size=0.2, random_state=42
)

# 2. spaCy word vectors

In [0]:
import spacy
# Load the pretrained pipeline once; the helper functions in this notebook reuse this global `nlp`.
nlp = spacy.load("en_core_web_md")

def display_vectors(text):
  """Print one line per token of `text` with its vector diagnostics.

  Each line shows the token text, whether it has a vector, the norm of
  that vector, and whether the token is out of vocabulary.
  """
  for tok in nlp(text):
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

def get_vector(text):
  """Return the document-level vector of `text` (the average of its token vectors)."""
  return nlp(text).vector

In [0]:
# Similarity between the pretrained vectors of a few related word pairs.
word_pairs = [('man', 'woman'), ('king', 'queen'), ('actor', 'actress'), ('doctor', 'nurse')]
for first, second in word_pairs:
  print(nlp(first).similarity(nlp(second)))

0.7401744538491297
0.7252610345406867
0.7484467528753997
0.6880643786881521


In [0]:
# Same check for country / city pairs.
place_pairs = [('france', 'paris'), ('japan', 'tokyo'), ('usa', 'la')]
for country, city in place_pairs:
  print(nlp(country).similarity(nlp(city)))

0.7916328323319856
0.800706948877685
0.24974637037031683


In [0]:
# Show, token by token, whether the tweet at row position 3587 has a pretrained vector.
display_vectors(df_disaster['text'].iloc[3587])

Winnipeg True 6.459667 False
police True 7.3889823 False
seek True 5.802574 False
witnesses True 6.5771213 False
in True 5.0929856 False
Arlington True 6.3564496 False
and True 4.6577983 False
William True 6.7178607 False
fatal True 6.661328 False
crash True 6.6807313 False
http://t.co/N2bCf4M64V False 0.0 True


In [0]:
# Embed every tweet as a single document vector (one array per row).
X_train_we, X_val_we = [texts.apply(get_vector) for texts in (X_train, X_val)]

In [0]:
# Expand each Series of vectors into a DataFrame with one column per dimension.
# The new frames get a fresh 0..n-1 index, so they line up with the labels by
# position only, not by index label.
X_train_we, X_val_we = (
    pd.DataFrame(vectors.tolist()) for vectors in (X_train_we, X_val_we)
)
X_train_we.shape, X_val_we.shape

((6090, 300), (1523, 300))

In [0]:
# Peek at five randomly chosen embedded rows (unseeded, so the rows differ between runs).
X_train_we.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
5081,-0.081516,0.137667,0.047158,-0.066955,0.117892,0.100264,0.176085,-0.085922,0.002362,0.975046,-0.164599,-0.042957,-0.047453,-0.136181,-0.11007,-0.041855,-0.253748,1.235979,0.115536,0.040104,-0.123364,-0.00424,-0.069473,-0.25594,-0.173225,0.238471,-0.161529,-0.072304,0.034037,0.130991,-0.017803,0.093638,-0.082729,0.201306,-0.071427,-0.116918,0.098256,-0.018394,-0.031072,-0.036375,...,0.144326,0.000561,-0.058697,0.20552,0.116389,-0.088574,0.153458,0.139711,-0.010943,0.020584,-0.26056,0.104517,-0.08195,0.00756,-0.130997,0.107563,-0.021327,-0.090975,0.011279,0.061167,-0.11629,-0.000994,0.185405,0.05916,0.03538,0.027707,0.023836,0.036571,0.01308,-0.097502,-0.093522,0.022787,0.118051,-0.153704,0.208844,-0.169311,-0.004759,-0.082663,-0.114555,0.004344
5758,-0.052561,0.190063,-0.074788,-0.053283,0.029875,-0.09022,0.086876,-0.115823,0.003757,1.966864,-0.193011,-0.024046,0.128216,-0.023825,-0.059378,-0.034237,-0.077747,0.780934,-0.072087,-0.097178,-0.019426,-0.06629,-0.036386,0.06546,0.119368,0.067731,-0.011317,-0.002829,0.078162,-0.009719,-0.031576,0.052267,0.048697,0.0491,0.011517,0.009192,-0.182388,0.095615,-0.064754,0.066247,...,0.011589,0.13703,-0.017862,0.113407,0.08313,-0.097887,0.136377,0.057883,0.211631,-0.130025,9e-06,-0.019518,-0.110055,-0.084601,0.009046,0.03452,0.056147,0.182896,-0.119569,0.210105,0.091271,0.017967,0.036955,-0.001079,0.057208,0.073697,0.050998,0.04346,0.026884,0.067419,-0.16935,0.007242,0.017733,-0.133195,-0.002906,-0.020693,-0.12725,0.007629,-0.010521,0.036336
1682,0.140874,-0.029021,-0.049915,-0.120664,0.127636,0.020661,0.015689,0.156856,-0.135592,1.186285,-0.239191,-0.120886,0.033742,0.019426,-0.106772,-0.03539,0.036074,0.823016,-0.045634,-0.070333,0.121116,-0.098424,0.082502,-0.095856,0.096756,-0.067063,-0.164492,0.075859,-0.092799,0.191781,0.140146,0.07168,-0.126514,0.005881,0.02106,-0.021668,0.050026,0.182197,-0.067946,0.061699,...,0.063373,0.026776,-0.059228,0.191199,-0.048139,-0.095516,-0.030908,0.10195,-0.057712,-0.013521,-0.247572,0.023582,0.055709,0.126077,0.128854,0.016681,0.023398,-0.159658,-0.089787,-0.136099,-0.089266,-0.016581,-0.102863,-0.023974,0.06199,0.020311,0.097904,-0.003077,0.042361,0.033723,-0.02647,-0.026463,-0.017024,0.048407,-0.04328,-0.071264,-0.056487,0.040557,-0.140717,0.082089
5494,-0.227624,0.187711,-0.014611,0.163355,0.060943,0.011739,-0.03989,-0.087764,-0.131447,1.483381,-0.268507,-0.087014,-0.098206,-0.018229,-0.350477,-0.141304,-0.078607,1.083124,0.043478,-0.068898,0.076843,-0.022701,0.076501,0.110489,0.010406,-0.048846,-0.047487,0.001894,0.048635,0.175835,-0.143004,-0.049379,-0.023022,0.04953,-0.063634,0.078642,-0.056384,0.044472,0.061755,0.272416,...,0.098965,0.131487,-0.009691,-0.027884,-0.013798,-0.200838,-0.029467,0.046903,-0.072245,-0.052238,0.103558,0.05146,0.067216,-0.0181,0.089859,0.020468,-0.024731,0.163648,-0.068588,0.023303,-0.054314,-0.014147,0.071747,-0.065753,0.186291,0.100535,0.010419,0.023661,-0.069279,-0.18772,-0.157786,-0.116714,0.108426,-0.073736,-0.003172,-0.225751,-0.055872,-0.158389,-0.114203,0.151461
5955,-0.163305,0.131606,-0.017082,0.004019,-0.093206,-0.003892,-0.021231,-0.166458,0.020036,1.166113,-0.013922,0.253742,0.127557,-0.053482,0.007639,0.057512,-0.057403,1.222748,-0.219217,-0.144139,-0.142976,-0.112124,-0.117768,0.05531,0.150681,0.113004,-0.029543,-0.068641,0.122258,0.04728,-0.077243,-0.052187,-0.086145,0.070125,-0.047727,-0.001461,0.033768,0.255867,-0.021139,0.131784,...,0.18338,0.127509,0.047511,0.043675,0.011609,-0.180713,-0.084524,0.036081,0.469995,-0.04071,-0.013356,-0.054351,-0.100465,-0.050523,0.008416,-0.041062,-0.087567,-0.117191,-0.205635,0.138599,0.158476,0.010989,0.126529,-0.165656,0.145881,-0.077725,-0.030732,-0.107863,0.134263,-0.069149,-0.114707,0.028937,0.115014,0.09964,-0.103545,0.010271,-0.108715,-0.156782,0.057225,0.019538


# 3. Modelamiento

In [0]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def test_model(train, val, y_train, y_val):
    """Fit a default SVC on the training features and print validation metrics.

    Parameters
    ----------
    train : feature matrix used to fit the classifier.
    val : feature matrix the fitted classifier is evaluated on.
    y_train : labels for `train`.
    y_val : ground-truth labels for `val`.

    Prints the accuracy and F1 score on the validation set as percentages;
    returns nothing.
    """
    svmodel = svm.SVC()
    svmodel.fit(train, y_train)

    predictions = svmodel.predict(val)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    print("Accuracy Score -> ", accuracy_score(y_val, predictions)*100)
    print("F1 Score -> ", f1_score(y_val, predictions)*100)

In [0]:
# Baseline: SVC on the averaged pretrained spaCy vectors.
test_model(X_train_we, X_val_we, y_train, y_val)

Accuracy Score ->  82.27183191070256
F1 Score ->  77.46243739565944


**¿Qué pasa con las palabras fuera del vocabulario?**

In [0]:
# A Twitter handle as an example of a token the pretrained vocabulary may not cover.
doc = nlp('@usatoday')
for token in doc:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov)

@usatoday False 0.0 True


# 4. Entrenamiento de vectores desde cero

In [0]:
def tokenizer(text):
  """Split `text` with the spaCy pipeline and return the token strings as a list."""
  return [tok.text for tok in nlp(text)]

In [0]:
# Tokenize every tweet in the full dataset; these token lists are the Word2Vec training corpus.
sents = df_disaster['text'].apply(tokenizer)

In [0]:
from gensim.models import Word2Vec

In [0]:
# Train Word2Vec embeddings from scratch on the tokenized tweets (sg=1 selects skip-gram).
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`) —
# confirm the installed gensim version before changing it.
model = Word2Vec(sents, min_count=3, size=100, window=5, sg=1)

In [0]:
# Query similarities through the KeyedVectors (`model.wv`): calling `similarity`
# on the Word2Vec model itself is deprecated and was removed in gensim 4.
print(model.wv.similarity('man', 'woman'))
print(model.wv.similarity('King', 'Queen'))

0.98032683
0.8026181


  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):
  


In [0]:
# Nearest neighbours of a handle that the pretrained spaCy vocabulary does not cover.
# Uses `model.wv`: the model-level `most_similar` is deprecated and was removed in gensim 4.
model.wv.most_similar('@usatoday')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('Case', 0.9983925819396973),
 ('units', 0.9983007907867432),
 ('min', 0.9982388019561768),
 ('MIDO', 0.9982192516326904),
 ('Travel', 0.9981244802474976),
 ('Navy', 0.9981186389923096),
 ('Dramatic', 0.9981140494346619),
 ('Newest', 0.9980981945991516),
 ('Route', 0.9980724453926086),
 ('Complex', 0.9980449676513672)]

In [0]:
def document_vector(sent, w2v_model=None):
  """Average the Word2Vec vectors of the tokens in `sent`.

  Parameters
  ----------
  sent : sequence of token strings.
  w2v_model : optional object exposing its word vectors as `.wv`. Defaults to
    the notebook-level `model`, so existing one-argument calls keep working.

  Returns
  -------
  A 1-D array with one entry per embedding dimension. Tokens missing from
  the vocabulary count as zero vectors in the average; an empty `sent`
  returns an all-zero vector.
  """
  # Go through `.wv`: indexing the Word2Vec model directly is deprecated.
  keyed_vectors = (model if w2v_model is None else w2v_model).wv
  dim = keyed_vectors.vector_size
  if len(sent) == 0:
    # The mean over an empty list is a scalar NaN, which would break np.stack downstream.
    return np.zeros(dim)
  word_vectors = [keyed_vectors[token] if token in keyed_vectors else np.zeros(dim) for token in sent]
  return np.asarray(word_vectors).mean(axis=0)

In [0]:
# Tokenize each split, then stack one averaged Word2Vec vector per tweet
# into a 2-D feature matrix.
X_train_sents = X_train.apply(tokenizer)
X_val_sents = X_val.apply(tokenizer)

X_train_w2v = np.stack([document_vector(tokens) for tokens in X_train_sents])
X_val_w2v = np.stack([document_vector(tokens) for tokens in X_val_sents])

  


In [0]:
# Same classifier, now on the averaged Word2Vec vectors trained in this notebook.
test_model(X_train_w2v, X_val_w2v, y_train, y_val)

Accuracy Score ->  73.60472751149048
F1 Score ->  63.52087114337568
