In [1]:
%autosave 30
%load_ext autoreload
%autoreload 2

Autosaving every 30 seconds


In [3]:
cd ..

/home/huawei123/kwx1991442/code-classification


In [5]:
import argparse
import logging
import time
import os
import numpy as np

from src.params import *
from src.baseline.dataloader import BaselineDataset

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--exp-name", type=str, default=EXPERIMENT_NAME,
        help="the name of this experiment")
    parser.add_argument("--method", type=str, choices=METHOD_CHOICES, default=METHOD,
        help="select method of classification")
    parser.add_argument("--seed", type=int, default=SEED, 
        help="seed of the experiment")
    parser.add_argument("--traintestsplit", type=float, default=0.7, 
        help="seed of the experiment")
    parser.add_argument("--mode", type=str, choices=MODE_CHOICES, default=MODE,
        help="eval mode is used to measure accuracy, predict mode to get prediction")
    parser.add_argument("--liblinear-params", type=str, default=LIBLINEAR_PARAMS,
        help="params of LIBLINEAR classifier")
    
    parser.add_argument('--save-model', action='store_true', 
        help="use this flag to save the model")
    parser.add_argument('--save-predictions', action='store_true', 
        help="use this flag to save predictions")
    parser.add_argument('--save-metrics', action='store_true', 
        help="use this flag to save metrics")

    args = parser.parse_args("")
    return args

In [7]:
args = parse_args()

np.random.seed(args.seed)

run_name = f"{args.exp_name}_{args.method}_{args.seed}_{int(time.time())}"
logging.info(f"{run_name=}")

data = BaselineDataset()
logging.info(f"{len(data.vocab)=}")
logging.info(f"Dataset is setup!")

y_train, x_train, y_val, x_val = data.get_input_train_val(args.traintestsplit)
logging.info(f"Data in splitted to train {len(y_train)=} and val {len(y_val)=}.")

2023-08-11 19:05:46,540 - root - INFO - run_name='TEST_baseline_42_1691769946'
100%|██████████| 11327/11327 [00:15<00:00, 751.61it/s]
100%|██████████| 11327/11327 [00:16<00:00, 705.21it/s]
2023-08-11 19:06:21,112 - root - INFO - len(data.vocab)=1306462
2023-08-11 19:06:21,113 - root - INFO - Dataset is setup!
100%|██████████| 11327/11327 [00:21<00:00, 537.62it/s]
2023-08-11 19:06:42,184 - root - INFO - Data in splitted to train len(y_train)=8025 and val len(y_val)=3302.


In [15]:
# Python program to generate word vectors using Word2Vec
 
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
 
warnings.filterwarnings(action = 'ignore')
 
import gensim
from gensim.models import Word2Vec

In [16]:
# !wget https://www.gutenberg.org/files/11/11-0.txt

In [112]:
#  Reads ‘alice.txt’ file
sample = open("/home/huawei123/kwx1991442/code-classification/notebooks/11-0.txt")
s = sample.read()
 
# Replaces escape character with space
f = s.replace("\n", " ")

In [113]:
f[:1000].split('.')

['\ufeffThe Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll  This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever',
 ' You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www',
 'gutenberg',
 'org',
 ' If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook',
 '  Title: Alice’s Adventures in Wonderland  Author: Lewis Carroll  Release Date: January, 1991 [eBook #11] [Most recently updated: October 12, 2020]  Language: English  Character set encoding: UTF-8  Produced by: Arthur DiBianca and David Widger  *** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***  [Illustration]     Alice’s Adventures in Wonderland  by Lewis Carroll  THE MILLENNIUM FULCRUM EDITION 3',
 '0  Contents   

In [162]:
# data = []
 
# # iterate through each sentence(?) in the file
# for i in sent_tokenize(f):
#     temp = []
     
#     # tokenize the sentence into words
#     for j in word_tokenize(i):
#         temp.append(j.lower())
 
#     data.append(temp)

from nltk import ToktokTokenizer

tokenizer = ToktokTokenizer()

data = tokenizer.tokenize(f.lower())

In [166]:
data = [sentence.split(' ') for sentence in ' '.join(data).split('.')]

In [167]:
data[:50]

[['\ufeffthe',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'alice',
  '’',
  's',
  'adventures',
  'in',
  'wonderland',
  ',',
  'by',
  'lewis',
  'carroll',
  'this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'in',
  'the',
  'united',
  'states',
  'and',
  'most',
  'other',
  'parts',
  'of',
  'the',
  'world',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever'],
 ['',
  'you',
  'may',
  'copy',
  'it',
  ',',
  'give',
  'it',
  'away',
  'or',
  're-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at',
  'www'],
 ['gutenberg'],
 ['org'],
 ['',
  'if',
  'you',
  'are',
  'not',
  'located',
  'in',
  'the',
  'united',
  'states',
  ',',
  'you',
  'will',
  'have',
  'to',
  'check',
  'the',
  'laws',
  'of',
  'the',
  'country',
  'where',
  'you',
  'are',
  '

In [181]:
# Create CBOW model
model1 = gensim.models.Word2Vec(data, min_count = 1,
                              vector_size = 100, window = 5)
# model1.build_vocab(data)  # prepare the model vocabulary
# model1.train(data, total_examples=model1.corpus_count, epochs=model1.epochs)  # train word vectors

2023-08-11 19:57:07,933 - gensim.models.word2vec - INFO - collecting all words and their counts
2023-08-11 19:57:07,934 - gensim.models.word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-08-11 19:57:07,944 - gensim.models.word2vec - INFO - collected 3444 word types from a corpus of 37606 raw words and 1223 sentences
2023-08-11 19:57:07,944 - gensim.models.word2vec - INFO - Creating a fresh vocabulary
2023-08-11 19:57:07,954 - gensim.utils - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 3444 unique words (100.00% of original 3444, drops 0)', 'datetime': '2023-08-11T19:57:07.953984', 'gensim': '4.3.1', 'python': '3.8.10 (default, May 26 2023, 14:05:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.15.0-46-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
2023-08-11 19:57:07,954 - gensim.utils - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 37606 word corpus (100.00% of original 37606, drops 0)', 'dateti

In [182]:
model1.__dict__.keys()

dict_keys(['vector_size', 'workers', 'epochs', 'train_count', 'total_train_time', 'batch_words', 'sg', 'alpha', 'min_alpha', 'window', 'shrink_windows', 'random', 'hs', 'negative', 'ns_exponent', 'cbow_mean', 'compute_loss', 'running_training_loss', 'min_alpha_yet_reached', 'corpus_count', 'corpus_total_words', 'max_final_vocab', 'max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'cum_table', 'raw_vocab', 'wv', 'hashfxn', 'seed', 'layer1_size', 'comment', 'load', 'effective_min_count', 'lifecycle_events', 'syn1neg'])

In [183]:
# print(data[:1000])

In [184]:
# model1.random
# model1.wv.similarity('Alice', 'wonder')
# model1.wv.most_similar_cosmul()
# model1.wv.similar_by_word('she')
# model1.wv.similar_by_word('alice')
model1.wv.similar_by_word('wonderland')

[('before', 0.9975448846817017),
 ('than', 0.9975435137748718),
 ('some', 0.9974377751350403),
 ('their', 0.9974318146705627),
 ('will', 0.9974080324172974),
 ('about', 0.9973976612091064),
 ('have', 0.9973900318145752),
 (')', 0.9973331689834595),
 ('works', 0.997329831123352),
 ('what', 0.9973255395889282)]

In [186]:
# Print results
print("Cosine similarity between 'she' " +
               "and 'alice' - CBOW : ",
    model1.wv.similarity('she', 'alice'))
     
print("Cosine similarity between 'alice' " +
                 "and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))


Cosine similarity between 'she' and 'alice' - CBOW :  0.99962825
Cosine similarity between 'alice' and 'machines' - CBOW :  0.9463065


In [180]:
# Create Skip Gram model
model2 = gensim.models.Word2Vec(data, min_count = 1, vector_size = 100,
                                             window = 5, sg = 1)
 
# Print results
print("Cosine similarity between 'alice' " +
          "and 'wonderland' - Skip Gram : ",
    model2.wv.similarity('alice', 'wonderland'))
     
print("Cosine similarity between 'alice' " +
            "and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

2023-08-11 19:55:52,680 - gensim.models.word2vec - INFO - collecting all words and their counts
2023-08-11 19:55:52,681 - gensim.models.word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-08-11 19:55:52,685 - gensim.models.word2vec - INFO - collected 3444 word types from a corpus of 37606 raw words and 1223 sentences
2023-08-11 19:55:52,686 - gensim.models.word2vec - INFO - Creating a fresh vocabulary
2023-08-11 19:55:52,694 - gensim.utils - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 3444 unique words (100.00% of original 3444, drops 0)', 'datetime': '2023-08-11T19:55:52.693963', 'gensim': '4.3.1', 'python': '3.8.10 (default, May 26 2023, 14:05:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.15.0-46-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
2023-08-11 19:55:52,695 - gensim.utils - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 37606 word corpus (100.00% of original 37606, drops 0)', 'dateti

Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.8295784
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.7687255
