### Building a Word2Vec model using gensim

In [1]:
import gensim

In [110]:
# Read the transcript into a list with one string per line (trailing newlines
# are kept by readlines()).
# The encoding is passed explicitly so the result does not depend on the
# platform's default codec -- assumes the file is UTF-8; adjust if it is not.
with open("./data/brba-data.txt", "r", encoding="utf-8") as f:
    txt = f.readlines()

In [111]:
# Number of raw lines read from the transcript file (blank lines included).
len(txt)

12900

In [112]:
# Peek at the first ten raw lines to see the "Speaker: dialogue" layout.
txt[:10]

['Scene: Desert\n',
 '\n',
 "Walter: My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane Albuquerque, New Mexico, 87104. To all law enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler you are the love of my life. I hope you know that. Walter Junior you're my big man. There are going to be some things. Things that you'll come to learn about me in the next few days. I just want you to know that no matter how it may look, I only had you in my heart. Goodbye.\n",
 '\n',
 'Scene: White Residence\n',
 '(Three weeks earlier)\n',
 '\n',
 'Skyler: Happy Birthday.\n',
 '\n',
 'Walter: Look at that.\n']

In [113]:
# Per line: remove newline characters, then rewrite every ":" as " says"
# (so "Walter: ..." becomes "Walter says ...").
txt = [
    line.replace(":", " says")
    for line in (raw.replace('\n', "") for raw in txt)
]

In [114]:
# Check the effect of the newline / colon replacement on the first ten lines.
txt[:10]

['Scene says Desert',
 '',
 "Walter says My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane Albuquerque, New Mexico, 87104. To all law enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler you are the love of my life. I hope you know that. Walter Junior you're my big man. There are going to be some things. Things that you'll come to learn about me in the next few days. I just want you to know that no matter how it may look, I only had you in my heart. Goodbye.",
 '',
 'Scene says White Residence',
 '(Three weeks earlier)',
 '',
 'Skyler says Happy Birthday.',
 '',
 'Walter says Look at that.']

In [115]:
# Line count before empty lines are dropped.
len(txt)

12900

In [116]:
# Discard the empty strings; filter(None, ...) keeps only non-empty lines.
txt = list(filter(None, txt))

In [117]:
# First ten lines once the empty ones have been removed.
txt[:10]

['Scene says Desert',
 "Walter says My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane Albuquerque, New Mexico, 87104. To all law enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler you are the love of my life. I hope you know that. Walter Junior you're my big man. There are going to be some things. Things that you'll come to learn about me in the next few days. I just want you to know that no matter how it may look, I only had you in my heart. Goodbye.",
 'Scene says White Residence',
 '(Three weeks earlier)',
 'Skyler says Happy Birthday.',
 'Walter says Look at that.',
 "Skyler says That is veggie bacon. Believe it or not. Zero cholesterol. You won't even taste the difference. What time do you think you'll be home?",
 'Walter says Same time.',
 "Skyler says I don't want him dicking you around tonight. You get paid till 5, you work till 5, no later.",
 'Walter says Hey.']

In [118]:
# Number of non-empty lines that remain.
len(txt)

6460

In [142]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [154]:
# Container for the tokenised corpus: one list of tokens per transcript line.
story = []

In [155]:
# Tokenise every transcript line into a list of lowercase word tokens.
# `story` is rebuilt from scratch here (rather than appended to a list created
# in another cell) so re-running this cell cannot duplicate the corpus.
# simple_preprocess lowercases its input itself, so no separate .lower() call
# is needed; min_len=4 discards tokens shorter than four characters.
story = [simple_preprocess(sent, min_len=4) for sent in txt]


In [156]:
# Inspect the first three tokenised lines.
story[:3]

[['scene', 'says', 'desert'],
 ['walter',
  'says',
  'name',
  'walter',
  'hartwell',
  'white',
  'live',
  'negra',
  'arroyo',
  'lane',
  'albuquerque',
  'mexico',
  'enforcement',
  'entities',
  'this',
  'admission',
  'guilt',
  'speaking',
  'family',
  'skyler',
  'love',
  'life',
  'hope',
  'know',
  'that',
  'walter',
  'junior',
  'there',
  'going',
  'some',
  'things',
  'things',
  'that',
  'come',
  'learn',
  'about',
  'next',
  'days',
  'just',
  'want',
  'know',
  'that',
  'matter',
  'look',
  'only',
  'heart',
  'goodbye'],
 ['scene', 'says', 'white', 'residence']]

In [157]:
# Number of tokenised lines; should equal the number of non-empty lines in txt.
len(story)

6460

In [158]:
# Configure the Word2Vec model; the vocabulary is built and the model trained
# in the following cells.
# `seed` fixes the random number generator and `workers=1` removes
# thread-scheduling jitter, so a fresh "Restart & Run All" gives repeatable
# vectors. gensim's documentation notes that fully deterministic runs also
# need a fixed PYTHONHASHSEED in the environment.
model = gensim.models.Word2Vec(
    window=5,      # max distance between the current and predicted word
    min_count=2,   # ignore words that occur fewer than 2 times in total
    seed=42,
    workers=1,
)

In [159]:
# Scan the tokenised corpus once to build the model's vocabulary.
model.build_vocab(story)

In [160]:
# Number of sentences recorded by build_vocab; expected to match len(story).
model.corpus_count

6460

In [161]:
# Train on the tokenised corpus, reusing the sentence count recorded by
# build_vocab and the model's own configured number of epochs.
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(193390, 291370)

In [162]:
# Words whose vectors are closest to "walter", with their similarity scores.
model.wv.most_similar("walter")

[('jesse', 0.9995893836021423),
 ('skyler', 0.9994950294494629),
 ('hank', 0.9994397163391113),
 ('walt', 0.999352216720581),
 ('saul', 0.9993491172790527),
 ('marie', 0.9993295073509216),
 ('jane', 0.9992544651031494),
 ('badger', 0.9991409182548523),
 ('yeah', 0.999056875705719),
 ('steve', 0.9990451335906982)]

In [167]:
# Ask the model which of the three names fits least well with the other two.
model.wv.doesnt_match(['marie','hank','saul'])

'saul'

In [168]:
# Raw embedding vector learned for the token "walt".
model.wv["walt"]

array([-0.43887055,  0.34081435,  0.15101771,  0.04315789, -0.1283621 ,
       -0.44947276,  0.37962434,  0.81392366, -0.06860617, -0.40335256,
       -0.06915604, -0.63560337, -0.0413849 ,  0.17727022, -0.09895984,
       -0.2014012 ,  0.26714882, -0.4762287 ,  0.15089296, -0.78362787,
        0.14730045,  0.28750035,  0.33339944,  0.01046642,  0.13457146,
        0.06008428, -0.33925304,  0.02045707, -0.33760065, -0.01528114,
        0.44504243, -0.03814797,  0.26689872, -0.44614285, -0.32718593,
        0.2632043 ,  0.0062761 , -0.3464539 ,  0.0682815 , -0.6870623 ,
       -0.00715544, -0.24094802, -0.41857317, -0.06536856,  0.2464687 ,
       -0.368253  , -0.33499786,  0.03673598,  0.26577032,  0.27274832,
        0.15888363, -0.2058587 , -0.08862621, -0.02151181,  0.21641453,
        0.15759075,  0.10944643, -0.26882002, -0.31333518,  0.05542568,
        0.06373267,  0.07567377,  0.09416491, -0.17766474, -0.5669725 ,
        0.43322015, -0.00336772,  0.3530082 , -0.7198728 ,  0.69

In [169]:
# Dimensionality of a single word vector.
model.wv["walt"].shape

(100,)

In [170]:
# Similarity between the vectors of "walt" and "walter".
model.wv.similarity('walt','walter')

0.9993522

In [177]:
# Similarity between two different characters. A score this close to 1.0 for
# an arbitrary pair suggests the vectors are barely differentiated.
model.wv.similarity('hank','jesse')

0.99928856

**Note:** This model is not well trained. The data was barely preprocessed and is just a transcript of the show, so the results are not very good (nearly every pair of words scores a similarity of about 0.999). I did this purely as a learning exercise.