In [1]:
import getpass
import logging
import os
import re
from collections import defaultdict
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pyodbc
import tensorflow as tf
from gensim import corpora, models, similarities
from gensim.models.word2vec import Word2Vec
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from googletrans import Translator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine

from FlaskApp.transform import *

from job_model import JobModel
from app_model import AppModel
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# googletrans client, pinned to the translate.googleapis.com service URL.
translator = Translator(service_urls=['translate.googleapis.com'])
# Sastrawi stemmer built through its factory; `stemmer` is what the
# preprocessing cells further down pass to stemmer_words / call .stem on.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [3]:
# Display every column of a frame, but cap each cell's text at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)

In [4]:
# Connection settings are read from the environment so that the database
# password is not stored in the notebook. The non-secret settings fall back to
# the local development values; the password is prompted for when the
# HR_DB_PASSWORD variable is not set.
user = os.environ.get('HR_DB_USER', 'huda')
password = os.environ.get('HR_DB_PASSWORD') or getpass.getpass('HRSystemDB password: ')
host = os.environ.get('HR_DB_HOST', '127.0.0.1')
port = int(os.environ.get('HR_DB_PORT', '1433'))
database = os.environ.get('HR_DB_NAME', 'HRSystemDB')


def get_connection():
    """Create the SQLAlchemy engine for the SQL Server database via pyodbc.

    Reads the module-level ``user``, ``password``, ``host``, ``port`` and
    ``database`` settings. User name and password are percent-encoded so that
    characters such as ``@``, ``:`` or ``/`` cannot corrupt the URL.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    return create_engine(
        url=f"mssql+pyodbc://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{database}?driver=SQL Server",
    )


engine = get_connection()
# NOTE(review): `conn` is opened here but the queries visible in this notebook
# go through `engine.execute`; confirm whether it is still needed.
conn = engine.connect()

In [53]:
# NOTE(review): applicant_id is not referenced by any query in this cell.
applicant_id = 31790


def _query_to_frame(sql):
    """Execute ``sql`` on the shared ``engine`` and wrap the result rows in a DataFrame."""
    return pd.DataFrame(engine.execute(sql))


# Job postings with their lookup names resolved.
# NOTE(review): every join is a RIGHT JOIN, so lookup rows without a matching
# Job are kept as well (with NULL job columns) - confirm this is intended.
df_job = _query_to_frame(
    """
    SELECT Job.JobID, Job.UsiaMax, Job.SalaryMin, Job.SalaryMax, City.Name AS CityName, Province.Name AS ProvinceName, EducationLevel.EducationLevelName, Major.MajorName, Job.DriverLicenseType, Job.UsingGlasses, Job.Gender, Job.MaritalStatus, Job.JobTitle, FunctionPosition.FunctionPositionName, Job.Description, Job.Requirement
    FROM (((((Job
    RIGHT JOIN FunctionPosition ON Job.FunctionPositionID = FunctionPosition.FunctionPositionID)
    RIGHT JOIN EducationLevel ON Job.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN City ON Job.CityID = City.CityID)
    RIGHT JOIN Province ON Job.ProvinceID = Province.ProvinceID)
    RIGHT JOIN Major ON Job.MajorID = Major.MajorID)
    """
)

# Plain lookup tables (id + name).
df_function = _query_to_frame(
    """
    SELECT FunctionPositionID, FunctionPositionName
    FROM FunctionPosition
    """
)

df_education = _query_to_frame(
    """
    SELECT EducationLevelID, EducationLevelName
    FROM EducationLevel
    """
)

df_city = _query_to_frame(
    """
    SELECT CityID, Name AS CityName
    FROM City
    """
)

df_province = _query_to_frame(
    """
    SELECT ProvinceID, Name AS ProvinceName
    FROM Province
    """
)

df_major = _query_to_frame(
    """
    SELECT MajorID, MajorName
    FROM Major
    """
)

# Applicant profile, education history and work experience.
# NOTE(review): each of the three queries LEFT JOINs Pipeline without selecting
# or filtering on any Pipeline column, which can only repeat rows - confirm.
df_applicant = _query_to_frame(
    """
    SELECT Applicant.ApplicantID, Applicant.Dob, Applicant.ExpectedSalary, City.Name AS CityName, Province.Name AS ProvinceName, Applicant.DriverLicenseType, Applicant.IsUsingGlasses, Applicant.Gender, Applicant.MaritalStatus, Applicant.Strengthness
    FROM (((Applicant
    RIGHT JOIN City ON Applicant.CurrentAddressCityID = City.CityID)
    RIGHT JOIN Province ON Applicant.CurrentAddressProvinceID = Province.ProvinceID)
    LEFT JOIN Pipeline ON Applicant.ApplicantID = Pipeline.ApplicantID)
    """
)

df_applicant_education = _query_to_frame(
    """
    SELECT ApplicantEducation.ApplicantID, ApplicantEducation.DateStart, ApplicantEducation.DateEnd, EducationLevel.EducationLevelName, Major.MajorName
    FROM (((ApplicantEducation
    RIGHT JOIN EducationLevel ON ApplicantEducation.EducationLevelID = EducationLevel.EducationLevelID)
    RIGHT JOIN Major ON ApplicantEducation.MajorID = Major.MajorID)
    LEFT JOIN Pipeline ON ApplicantEducation.ApplicantID = Pipeline.ApplicantID)
    """
)

df_applicant_experience = _query_to_frame(
    """
    SELECT ApplicantExperience.ApplicantID, ApplicantExperience.DateFrom, ApplicantExperience.DateTo, ApplicantExperience.Position, ApplicantExperience.JobDescription
    FROM (ApplicantExperience
    LEFT JOIN Pipeline ON ApplicantExperience.ApplicantID = Pipeline.ApplicantID)
    """
)

In [59]:
# Pass every job description through clean_text (not defined in this notebook;
# presumably provided by the FlaskApp.transform star import).
df_job['Description'] = df_job['Description'].map(clean_text)

In [62]:
# Print each job description on its own line for a visual check.
for description_text in df_job['Description']:
    print(description_text)

under asst front office manager at gunawangsa hotel merr surabaya
becoming book keeper for gunawangsa hotel merr
developing of android application and their integration with back end services in accordance with user requirements analyzing and optimizing application code for efficiency reability and performance
melakukan tugas kesekretariatan menggunakan terminologi prosedur dan dokumen hukum menyiapkan dokumen dan korespondensi hukum seperti surat panggilan keluhan isyarat dan panggilan dari pengadilan menyiapkan dan memproses dokumen dan surat surat hukum seperti surat panggilan panggilan dari pengadilan keluhan permohonan banding mosi dan perjanjian praperadilan mengorganisir dan memelihara perpustakaan hukum dokumen dan file kasus membuat jadwal dan janji
bantu handle pekerjaan atasan membuat surat menyurat mengatur jadwal atasan
mencari para calon klien yang prospektif mengembangkan dan merealisasikan rencana penjualan dan tergetnya melakukan penawaran harga dan negosiasi dengan kl

In [None]:
# English -> Indonesian replacements for the English words that show up in the
# job descriptions printed above. Built from ordered pairs so the insertion
# order matches the order in which the words were collected.
# NOTE(review): the key 'back end' contains a space, so a token-by-token lookup
# would never hit it - confirm how this table is meant to be applied.
englishwords = dict([
    ('under', 'bawah'),
    ('asst', 'asisten'),
    ('front', 'depan'),
    ('office', 'kantor'),
    ('manager', 'manajer'),
    ('at', 'pada'),
    ('becoming', 'menjadi'),
    ('book', 'buku'),
    ('keeper', 'pemegang'),
    ('for', 'untuk'),
    ('developing', 'mengembangkan'),
    ('of', 'dari'),
    ('application', 'aplikasi'),
    ('their', 'mereka'),
    ('integration', 'integrasi'),
    ('with', 'dengan'),
    ('back end', 'back-end'),
    ('services', 'layanan'),
    ('in', 'di'),
    ('accordance', 'sesuai'),
    ('user', 'pengguna'),
    ('requirements', 'persyaratan'),
    ('analyzing', 'analisis'),
    ('and', 'dan'),
    ('optimizing', 'optimasi'),
    ('code', 'kode'),
    ('efficiency', 'efisiensi'),
    ('reability', 'reabilitas'),
])

In [65]:
# New approach (plan): tokenize, translate the tokens one by one, check each one
# against the words that failed to translate, then build a set from the
# translated tokens.
# What this cell does so far: collect the distinct job descriptions (whole
# strings, not tokens) into a set.
description_corpus = set(df_job.Description.values)

In [66]:
# Display the set of distinct job descriptions built in the previous cell.
description_corpus

{'',
 '123',
 'able to work children age 2 6 years old make teaching and administrative reports to the management develop supervise and implement applicable programs follow applicable teaching instructions evaluate and maintain student development performs planning teaching and assessing students',
 'achieves the sales and business development targets as states in the annual sales plan create maintain and expand contact with potential clients secure accurate prospect new clients by performing market segmentation and apply different organization approaches in targeting and selecting clients build strong customer relationship and new customer partnership bring forward new initiatives for continuous improvement and win loss analysis',
 'administrasi dokumen legal secara softcopy dan hardcopy membuat catatan dan melengkapi pengarsipan memo internal memo keluar legal ke dept lain membuat catatan dan melengkapi pengarsipan surat perintah kerja subkontraktor agency membuat catatan dan melengk

In [46]:
# Stopword list: first column of data/stopwords.csv (values cast to str),
# followed by the extra entry 'dst'.
_stopword_rows = pd.read_csv('data/stopwords.csv').astype(str).values
stopwords = [row[0] for row in _stopword_rows]
stopwords.append('dst')

# Token replacement table handed to change_slangwords in the preprocessing
# cells: each key is a token as it appears in the text (abbreviation,
# misspelling, English or inflected word) and each value is the word to use in
# its place.
# The keys 'update' and 'analisa' used to be listed twice (with identical
# values); each key now appears exactly once, the resulting dict is unchanged.
slangwords = {
    # first batch
    'max': 'maksimal',
    'div': 'divisi',
    'tenant': 'sewa',
    'hokum': 'hukum',
    'branding': 'promosi',
    'stok': 'sedia',
    'termasukaplikasikomputer': 'komputer',
    'telekomunikais': 'telekomunikasi',
    'perinci': 'rinci',
    'risalah': 'surat',
    'mereview': 'ulas',
    'klien': 'langgan',
    'thn': 'tahun',
    'th': 'tahun',
    'min': 'minimal',
    'miniman': 'minimal',
    'jurnalid': 'jurnalis',
    'mayob': 'myob',
    'paja': 'pajak',
    'perban': 'bank',
    'maksimum': 'maksimal',
    'minimum': 'minimal',
    'bhs': 'bahasa',
    'pengorganisasia': 'organisasi',
    'analitis': 'analisis',
    'analisa': 'analisis',
    'memimiliki': 'milik',
    'manejer': 'manajer',
    'menejer': 'manajer',
    'bussines': 'bisnis',
    'development': 'kembang',
    'erpikir': 'pikir',
    'update': 'baru',
    'content': 'konten',
    'plan': 'rencana',
    'memposting': 'unggah',
    'utilitas': 'guna',
    'ms': 'microsoft',
    'perbankan': 'bank',
    'spv': 'supervisor',
    'komunikatif': 'komunikasi',
    'perencanaan': 'rencana',
    'operasionalisasi': 'operasional',
    'diatahkan': 'arah',
    'account': 'akun',
    'promote': 'promosi',
    'follower': 'ikut',
    'ig': 'instagram',
    'perijinan': 'izin',
    'kuasi': 'kuasa',
    'karyawannya': 'karyawan',
    'degan': 'dengan',
    'maupuan': 'maupun',

    # second batch
    'social': 'sosial',
    'brand': 'merek',
    'order': 'pesan',
    'driver': 'supir',
    'mengkoordinir': 'koordinasi',
    'mensupport': 'dukung',
    'padapenyediaan': 'sedia',
    'file': 'berkas',
    'mesan': 'pesan',
    'base': 'dasar',
    'maximum': 'maksimal',
    'accounting': 'akuntansi',
    'system': 'sistem',
    'building': 'bangun',
    'maintenance': 'pelihara',
    'team': 'tim',
    'deadline': 'tenggat',
    'vehicle': 'kendara',
    'networking': 'jaring',
    'installation': 'instalasi',
    'trouble': 'masalah',
    'presentation': 'presentasi',
    'customer': 'langgan',
    'satisfaction': 'puas',
    'planning': 'rencana',
    'tax': 'pajak',
    'mobile': 'seluler',
    'fast': 'cepat',
    'solving': 'pecah',
    'decision': 'putus',
    'making': 'buat',
    'marketing': 'pasar',
    'communication': 'komunikasi',
    'negotiation': 'negosiasi',
    'problem': 'masalah',
    'leasing': 'sewa',
    'service': 'layan',
    'leadership': 'pimpin',
    'manager': 'manajemen',
    'auditing': 'periksa',
    'auditor': 'periksa',
    'audit': 'periksa',
    'apartment': 'apartemen',
    'engineering': 'teknik',
    'chief': 'pimpin',
    'general': 'umum',
    'research': 'teliti',
    'trend': 'cenderung',
    'cash': 'uang',
    'flow': 'alir',
    'payment': 'bayar',
    'supplier': 'pasok',
    'finance': 'uang',
    'journal': 'jurnal',
    'standing': 'duduk',
    'charge': 'biaya',
    'cleaning': 'bersih',
    'lift': 'angkat',
    'adjustment': 'atur',
    'correction': 'koreksi',
    'schedule': 'jadwal',
    'cust': 'langgan',
}

In [635]:
# --- applicant ---
# Remove duplicated rows, then blank out missing values.
df_applicant = df_applicant.drop_duplicates().fillna('')

# Derive Age from Dob: pick_date -> filter_date(…, 1958, 2006) -> to_datetime -> get_age.
# (1958 and 2006 are presumably the accepted birth-year bounds - confirm in filter_date.)
_filtered_dob = df_applicant['Dob'].map(pick_date).apply(lambda dob: filter_date(dob, 1958, 2006))
df_applicant['Age'] = pd.to_datetime(_filtered_dob).map(get_age)

df_applicant = df_applicant.drop(columns=['Dob'])

# Rows without a usable age get 0 so that the column can be stored as int.
df_applicant['Age'] = df_applicant['Age'].fillna(0).astype(int)

In [406]:
# --- education ---
df_applicant_education = df_applicant_education.fillna('')

# Both date columns get the same treatment:
# pick_date -> filter_date(…, 1980, 2023) -> to_datetime.
for _date_col in ('DateStart', 'DateEnd'):
    df_applicant_education[_date_col] = pd.to_datetime(
        df_applicant_education[_date_col].map(pick_date).apply(lambda raw: filter_date(raw, 1980, 2023))
    )

# Keep rows where both dates survived, then keep, per applicant, the last row
# in DateStart order; the date columns are not needed afterwards.
_has_both_dates = df_applicant_education.DateStart.notna() & df_applicant_education.DateEnd.notna()
df_applicant_education = (
    df_applicant_education[_has_both_dates]
    .sort_values('DateStart')
    .groupby(['ApplicantID'])
    .agg('last')
    .drop(columns=['DateStart', 'DateEnd'])
)

In [407]:
# --- experience ---
df_applicant_experience = df_applicant_experience.fillna('')

# Both date columns get the same treatment:
# pick_date -> filter_date(…, 1980, 2023) -> to_datetime.
for _date_col in ('DateFrom', 'DateTo'):
    df_applicant_experience[_date_col] = pd.to_datetime(
        df_applicant_experience[_date_col].map(pick_date).apply(lambda raw: filter_date(raw, 1980, 2023))
    )

# Keep rows where both dates survived. The explicit .copy() makes the filtered
# frame independent, so adding a column below does not raise pandas'
# SettingWithCopyWarning.
_has_both_dates = df_applicant_experience.DateFrom.notna() & df_applicant_experience.DateTo.notna()
df_applicant_experience = df_applicant_experience[_has_both_dates].copy()

# add YearsOfExperience column
# NOTE(review): the helper is called substract_months while the column is named
# YearsOfExperience - confirm the unit it returns.
df_applicant_experience['YearsOfExperience'] = substract_months(
    df_applicant_experience.DateFrom, df_applicant_experience.DateTo
)

# One row per applicant, in DateFrom order: text fields are joined with a
# space and the experience durations are summed. The date columns are
# aggregated and then dropped, as before.
df_applicant_experience = (
    df_applicant_experience
    .sort_values('DateFrom')
    .groupby(['ApplicantID'])
    .agg({
        'DateFrom': 'last',
        'DateTo': 'last',
        'JobDescription': ' '.join,
        'Position': ' '.join,
        'YearsOfExperience': 'sum',
    })
    .drop(columns=['DateFrom', 'DateTo'])
)

In [408]:
# --- merge ---
# Attach the per-applicant experience and education summaries. Both merges use
# the default join type, so applicants missing from either summary are dropped.
df_applicant = (
    df_applicant
    .merge(df_applicant_experience, on=['ApplicantID'])
    .merge(df_applicant_education, on=['ApplicantID'])
)

In [None]:
# --- PRE-PROCESSING APPLICANT ---
df_applicant = df_applicant.set_index(['ApplicantID'])

app_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'Position', 'JobDescription', 'Strengthness']
app_num = ['Age', 'ExpectedSalary']
app_bol = ['IsUsingGlasses']

# Keep only the feature columns, in num / bool / str order. The .copy() makes
# the selection an independent frame so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_applicant = df_applicant[app_num + app_bol + app_str].copy()

# --- str ---
df_applicant[app_str] = df_applicant[app_str].applymap(str.lower)

# Free-text columns share one pipeline (previously written out once per column):
# clean -> keep alphabet -> drop single characters -> collapse spaces ->
# translate -> stem -> split on ' ' -> replace slang -> drop stopwords -> re-join.
for _text_col in ('JobDescription', 'Strengthness'):
    df_applicant[_text_col] = (
        df_applicant[_text_col]
        .map(clean_text)
        .map(maintain_alphabet)
        .map(remove_single)
        .map(remove_morespace)
        .apply(lambda text: translate_teks(translator, text))
        .apply(lambda text: stemmer_words(stemmer, text))
        .apply(lambda text: text.split(' '))
        .apply(lambda tokens: change_slangwords(slangwords, tokens))
        .apply(lambda tokens: [tok for tok in tokens if tok not in stopwords])
        .apply(lambda tokens: ' '.join(tokens))
    )

# Short label columns are only translated.
for _label_col in ('Position', 'MajorName'):
    df_applicant[_label_col] = df_applicant[_label_col].apply(lambda text: translate_teks(translator, text))

# concat: JobDescription + ' ' + Strengthness becomes DescriptionStrengthness
# (kept at JobDescription's column position).
df_applicant['JobDescription'] = df_applicant['JobDescription'].str.cat(
    df_applicant['Strengthness'], sep=' '
)
df_applicant = (
    df_applicant
    .rename(columns={'JobDescription': 'DescriptionStrengthness'})
    .drop(columns=['Strengthness'])
)

# --- bool ---
# Stored as the lower-cased string form of the original value.
df_applicant['IsUsingGlasses'] = df_applicant['IsUsingGlasses'].astype(str).map(str.lower)

In [49]:
# Count of missing values per df_job column.
df_job.isna().sum()

UsiaMax                 0
SalaryMin               0
SalaryMax               0
CityName                0
ProvinceName            0
EducationLevelName      0
MajorName               0
DriverLicenseType       0
UsingGlasses            0
Gender                  0
MaritalStatus           0
JobTitle                0
FunctionPositionName    0
Description             0
Requirement             0
dtype: int64

In [58]:
# Column groups of df_job by kind: text, numeric, boolean.
job_str = ['CityName', 'ProvinceName', 'EducationLevelName', 'MajorName', 'DriverLicenseType', 'Gender', 'MaritalStatus', 'JobTitle', 'FunctionPositionName', 'Description', 'Requirement']
job_num = ['UsiaMax', 'SalaryMin', 'SalaryMax']
job_bol = ['UsingGlasses']

# --- general ---
# Index by JobID and blank out missing values.
df_job = df_job.set_index(['JobID']).fillna('')

# --- str ---
# Lower-case the text columns, then turn the literal value 'none' into ''.
df_job[job_str] = df_job[job_str].applymap(str.lower)
df_job = df_job.replace('none', '')

In [50]:
def remove_stopwords(stopwords, texts):
    """Return ``texts`` with every whitespace-separated word found in ``stopwords`` removed.

    The remaining words are re-joined with single spaces, so runs of whitespace
    in the input are collapsed.
    """
    return ' '.join(word for word in texts.split() if word not in stopwords)

In [51]:
# Clean, stem and strip stopwords from every job description.
df_job['Description'] = (
    df_job['Description']
    .map(clean_text)
    .map(stemmer.stem)
    .apply(lambda text: remove_stopwords(stopwords, text))
)

In [63]:
# Probe the stemmer on a single word. The recorded output shows 'karyawannya'
# coming back unchanged, which is why slangwords maps it to 'karyawan'.
stemmer.stem('karyawannya')

'karyawannya'

In [52]:
# Print each processed job description on its own line for a visual check.
for description_text in df_job.Description.values:
    print(description_text)

under asst front office manager at surabaya
book keeper for
developing of android application and their integration with back end services in accordance with user requirements analyzing and optimizing application code for efficiency reability and performance
laku tugas sekretariat terminologi prosedur dokumen hukum dokumen korespondensi hukum surat panggil keluh isyarat panggil adil proses dokumen surat surat hukum surat panggil panggil adil keluh mohon banding mosi janji praperadilan organisir pelihara pustaka hukum dokumen file jadwal janji
bantu handle kerja surat surat atur jadwal
cari calon klien prospektif kembang realisasi rencana jual tergetnya laku tawar harga negosiasi klien
hitung rencana anggar biaya rab hitung harga upah kerja progress kerja time schedule
aplikasi butuh usaha masuk existing aplikasi wujud desain web system fungsi
rawat bersih mobil kantor jemput karyawan kantor
description testing job kerja aplicant source
rab proyek jalan usaha mecakup spesifikasi volume 

In [31]:
# Print each job description on its own line (the recorded output of this cell
# shows the text before stemming and stopword removal).
for description_text in df_job.Description.values:
    print(description_text)

under asst front office manager at gunawangsa hotel merr surabaya
becoming book keeper for gunawangsa hotel merr
developing of android application and their integration with back end services in accordance with user requirements analyzing and optimizing application code for efficiency reability and performance
melakukan tugas kesekretariatan menggunakan terminologi prosedur dan dokumen hukum menyiapkan dokumen dan korespondensi hukum seperti surat panggilan keluhan isyarat dan panggilan dari pengadilan menyiapkan dan memproses dokumen dan surat surat hukum seperti surat panggilan panggilan dari pengadilan keluhan permohonan banding mosi dan perjanjian praperadilan mengorganisir dan memelihara perpustakaan hukum dokumen dan file kasus membuat jadwal dan janji
bantu handle pekerjaan atasan membuat surat menyurat mengatur jadwal atasan
mencari para calon klien yang prospektif mengembangkan dan merealisasikan rencana penjualan dan tergetnya melakukan penawaran harga dan negosiasi dengan kl

In [None]:
"""PRE-PROCESSING JOB"""
# Short label columns are only translated.
for _label_col in ('JobTitle', 'FunctionPositionName', 'MajorName'):
    df_job[_label_col] = df_job[_label_col].apply(lambda text: translate_teks(translator, text))

# Free-text columns share one pipeline (previously written out once per column):
# clean -> keep alphabet -> drop single characters -> collapse spaces ->
# translate -> stem -> split on ' ' -> replace slang -> drop stopwords -> re-join.
for _text_col in ('Description', 'Requirement'):
    df_job[_text_col] = (
        df_job[_text_col]
        .map(clean_text)
        .map(maintain_alphabet)
        .map(remove_single)
        .map(remove_morespace)
        .apply(lambda text: translate_teks(translator, text))
        .apply(lambda text: stemmer_words(stemmer, text))
        .apply(lambda text: text.split(' '))
        .apply(lambda tokens: change_slangwords(slangwords, tokens))
        .apply(lambda tokens: [tok for tok in tokens if tok not in stopwords])
        .apply(lambda tokens: ' '.join(tokens))
    )

# concatenate JobTitle and FunctionPositionName to be textual feature together
df_job['JobTitle'] = df_job['JobTitle'].str.cat(
    df_job['FunctionPositionName'], sep=' '
)
df_job = (
    df_job
    .rename(columns={'JobTitle': 'JobTitlePosition'})
    .drop(columns=['FunctionPositionName'])
)

# concatenate Description and Requirement to be textual feature together
df_job['Description'] = df_job['Description'].str.cat(
    df_job['Requirement'], sep=' '
)
df_job = (
    df_job
    .rename(columns={'Description': 'DescriptionRequirement'})
    .drop(columns=['Requirement'])
)

# Drop jobs whose combined text is only the ' ' separator, i.e. both parts were
# empty. The .copy() makes the filtered frame independent so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df_job = df_job[df_job['DescriptionRequirement'] != ' '].copy()

# --- int ---
# Empty strings (left by the earlier fillna('')) become 0 before the int cast.
df_job[job_num] = df_job[job_num].replace('', 0)
df_job[job_num] = df_job[job_num].astype(int)

# get mean from SalaryMin and SalaryMax (integer midpoint); values below
# 1_000_000 are reset to 0.
df_job['SalaryMin'] = (df_job['SalaryMax'] + df_job['SalaryMin']) // 2
df_job = df_job.rename(columns={'SalaryMin': 'SalaryMean'})
df_job['SalaryMean'] = df_job['SalaryMean'].apply(lambda salary: 0 if salary < 1_000_000 else salary)
df_job = df_job.drop(columns=['SalaryMax'])

# --- bool ---
# Stored as the lower-cased string form of the original value.
df_job['UsingGlasses'] = df_job['UsingGlasses'].astype(str).map(str.lower)



# Vocabulary tables: lower-case every name column.
# Function position names are additionally passed through function_replacement,
# remove_insideparentheses and remove_morespace, then stripped.
df_function['FunctionPositionName'] = (
    df_function['FunctionPositionName']
    .map(str.lower)
    .apply(function_replacement)
    .map(remove_insideparentheses)
    .map(remove_morespace)
    .map(str.strip)
)
df_education['EducationLevelName'] = df_education['EducationLevelName'].map(str.lower)
df_city['CityName'] = df_city['CityName'].map(str.lower)
df_province['ProvinceName'] = df_province['ProvinceName'].map(str.lower)
df_major['MajorName'] = df_major['MajorName'].map(str.lower)

In [414]:
# Full job text: title/position followed by description/requirement.
df_job['Texts'] = df_job['JobTitlePosition'] + ' ' + df_job['DescriptionRequirement']

In [417]:
# Full applicant text: past positions followed by description/strengths.
df_applicant['Texts'] = df_applicant['Position'] + ' ' + df_applicant['DescriptionStrengthness']

In [581]:
import gensim
import gensim.downloader as api
from gensim.models import doc2vec

# Training corpus: every job's Texts value split on whitespace into a token list.
dataset = [job_text.split() for job_text in df_job.Texts]
data = list(dataset)


def tagged_document(list_of_ListOfWords):
    """Yield one ``doc2vec.TaggedDocument`` per token list, tagged with its position in the input."""
    for position, words in enumerate(list_of_ListOfWords):
        yield doc2vec.TaggedDocument(words, [position])


# Materialise the tagged documents so the corpus can be iterated more than once.
data_train = list(tagged_document(data))

In [None]:
# Initialize the model
d2v_model = doc2vec.Doc2Vec(vector_size=120, min_count=10, epochs=90)

# build the vocabulary
d2v_model.build_vocab(data_train)

# Train Doc2Vec model
d2v_model.train(data_train, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

In [617]:
# Spot check: similarity score between the tokenized text of the job with index
# label 27 and the applicant with index label 15.
d2v_model.similarity_unseen_docs(df_job.Texts.loc[27].split(), df_applicant.Texts.loc[15].split())

0.72404635

In [None]:
# d2v_model.save('data/d2c_model.model')
# A saved Doc2Vec model has to be reloaded with Doc2Vec.load, not Word2Vec.load:
# loaded_model = doc2vec.Doc2Vec.load('data/d2c_model.model')

In [623]:
import gensim.downloader as api
from multiprocessing import cpu_count
from gensim.models.word2vec import Word2Vec

# Fetch the text8 dataset through gensim's downloader and materialise its
# items (each one a sentence, i.e. a list of words) in a list.
dataset = api.load("text8")
data = list(dataset)

# The first 1200 items train the initial model; the rest is held back to
# update the model afterwards.
data_1, data_2 = data[:1200], data[1200:]

# Train the initial Word2Vec model: every word is kept (min_count=0), one
# worker per CPU core.
w2v_model = Word2Vec(data_1, min_count=0, workers=cpu_count())

2023-04-14 15:20:26,761 : INFO : collecting all words and their counts
2023-04-14 15:20:26,762 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-14 15:20:28,188 : INFO : collected 207894 word types from a corpus of 12000000 raw words and 1200 sentences
2023-04-14 15:20:28,189 : INFO : Creating a fresh vocabulary
2023-04-14 15:20:28,729 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 207894 unique words (100.00% of original 207894, drops 0)', 'datetime': '2023-04-14T15:20:28.729935', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-14 15:20:28,729 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=0 leaves 12000000 word corpus (100.00% of original 12000000, drops 0)', 'datetime': '2023-04-14T15:20:28.729935', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-f

In [626]:
# Extend the existing vocabulary with the held-back part of the corpus ...
w2v_model.build_vocab(data_2, update=True)
# ... and continue training on it for 90 epochs. The call's return value (a
# tuple of two counts) is what the cell displays.
w2v_model.train(data_2, total_examples= w2v_model.corpus_count, epochs=90)

2023-04-14 15:21:47,466 : INFO : collecting all words and their counts
2023-04-14 15:21:47,466 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-14 15:21:47,991 : INFO : collected 128511 word types from a corpus of 5005207 raw words and 501 sentences
2023-04-14 15:21:47,991 : INFO : Updating model with new vocabulary
2023-04-14 15:21:48,770 : INFO : Word2Vec lifecycle event {'msg': 'added 0 new unique words (0.00% of original 128511) and increased the count of 128511 pre-existing words (100.00% of original 128511)', 'datetime': '2023-04-14T15:21:48.770283', 'gensim': '4.3.1', 'python': '3.10.0 | packaged by conda-forge | (default, Nov 10 2021, 13:20:59) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-14 15:21:49,273 : INFO : deleting the raw counts dictionary of 128511 items
2023-04-14 15:21:49,275 : INFO : sample=0.001 downsamples 36 most-common words
2023-04-14 15:21:49,275 : INFO : Word2Vec lif

(336751507, 450468630)