In [60]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import gensim
import io
import time
from datetime import timedelta
import progressbar

In [64]:
csv = 'lib/dataset'
# Read the first corpus (first CSV column is the index), discard rows with
# missing values and renumber the remaining rows from 0.
my_df = pd.read_csv(csv, index_col=0).dropna().reset_index(drop=True)
my_df.info()
x = my_df.text

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3381 entries, 0 to 3380
Data columns (total 2 columns):
text      3381 non-null object
target    3381 non-null int64
dtypes: int64(1), object(1)
memory usage: 52.9+ KB


In [65]:
# Load the second corpus; the first CSV column is used as the row index.
# Missing values are not dropped here.
csv = 'lib/dataset ahok'
df_wiki = pd.read_csv(csv,index_col=0)
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 2 columns):
target    4000 non-null int64
text      4000 non-null object
dtypes: int64(1), object(1)
memory usage: 93.8+ KB


In [66]:
# Load the third corpus; the first CSV column is used as the row index.
# Missing values are not dropped here.
csv = 'lib/datasetfm'
df2 = pd.read_csv(csv,index_col=0)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10803 entries, 0 to 10802
Data columns (total 2 columns):
text      10801 non-null object
target    10803 non-null int64
dtypes: int64(1), object(1)
memory usage: 253.2+ KB


In [67]:
# Only the raw text is used below, so strip the `target` column from every
# frame before stacking them.
my_df = my_df.drop(['target'], axis=1)
df_wiki = df_wiki.drop(['target'], axis=1)
df2 = df2.drop(['target'], axis=1)

# Stack the three corpora with pd.concat (DataFrame.append is deprecated since
# pandas 1.4 and removed in pandas 2.0), then drop rows with missing text and
# renumber the rows from 0 so each document gets a unique integer index.
my_df = pd.concat([my_df, df_wiki, df2]).dropna().reset_index(drop=True)
my_df.info()

x = my_df.text

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18182 entries, 0 to 18181
Data columns (total 1 columns):
text    18182 non-null object
dtypes: object(1)
memory usage: 142.1+ KB


In [68]:
SEED = 2000

# The combined corpus has no labels, so `x` is passed as both the features and
# the targets: every y_* split holds the same texts as its x_* counterpart.
# First hold out 2% of the documents ...
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(
    x,
    x,
    test_size=.02,
    random_state=SEED,
)
# ... then cut that hold-out in half: one half validation, one half test.
x_validation, x_test, y_validation, y_test = train_test_split(
    x_validation_and_test,
    y_validation_and_test,
    test_size=.5,
    random_state=SEED,
)

In [69]:
def labelize_tweets_ug(tweets,label):
    """Turn each text of ``tweets`` into a ``TaggedDocument``.

    Each text is split on whitespace to form the document words, and the
    document carries a single tag of the form ``<label>_<index value>``.

    Parameters
    ----------
    tweets : object exposing ``.index`` and iterating over its texts
        (a pandas Series in this notebook).
    label : str
        Prefix placed in front of the index value in every tag.

    Returns
    -------
    list
        One ``TaggedDocument`` per text, in iteration order.
    """
    return [
        TaggedDocument(text.split(), [label + '_%s' % idx])
        for idx, text in zip(tweets.index, tweets)
    ]

In [70]:
# Re-join the three splits into one Series and tag every document with the
# 'all' prefix ('all_<index>').
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

In [71]:
# Number of tagged documents that were built.
len(all_x_w2v)

18182

In [72]:
# CPU count reported by the OS; passed as `workers=` to the Doc2Vec models below.
cores=multiprocessing.cpu_count()
print(cores)

16


In [73]:
def get_vectors(model, corpus, size, label='all'):
    """Collect the trained document vectors for every document in ``corpus``.

    Each document is looked up under the tag ``<label>_<index value>``, i.e. the
    tag format produced by ``labelize_tweets_ug``.

    Parameters
    ----------
    model : trained Doc2Vec-like model
        Must expose its document vectors as ``model.dv`` (gensim 4 name) or
        ``model.docvecs`` (older name), indexable by tag string.
    corpus : object exposing ``.index`` and ``len()``
        (a pandas Series in this notebook); only its index values are used.
    size : int
        Dimensionality of the document vectors (number of output columns).
    label : str, optional
        Tag prefix the documents were labelled with. Defaults to ``'all'``,
        the prefix used when this notebook builds its tagged corpus.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), size)``; row ``k`` holds the vector of
        the ``k``-th document in ``corpus.index`` order.
    """
    # Prefer the gensim 4 attribute when the model has it; fall back to the
    # older `docvecs` name otherwise.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    vecs = np.zeros((len(corpus), size))
    for row, doc_id in enumerate(corpus.index):
        vecs[row] = doc_vectors['{}_{}'.format(label, doc_id)]
    return vecs

# DBOW (Distributed Bag of Words)

In [74]:
# dm=0 configures the DBOW variant. alpha and min_alpha start equal because the
# learning rate is stepped down by hand in the training loop.
model_ug_dbow = Doc2Vec(
    dm=0,
    vector_size=100,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
# Build the vocabulary from the tagged documents, with a progress bar.
model_ug_dbow.build_vocab(list(tqdm(all_x_w2v)))

100%|██████████| 18182/18182 [00:00<00:00, 2617858.48it/s]


In [75]:
%%time
for epoch in tqdm(range(30)):
    model_ug_dbow.train(utils.shuffle([x for x in all_x_w2v]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|██████████| 30/30 [00:28<00:00,  1.17s/it]

CPU times: user 31 s, sys: 4.26 s, total: 35.2 s
Wall time: 28.6 s





In [76]:
# Nearest neighbours of the word 'baik' in the learned word-vector space.
# Query through `.wv`: calling most_similar directly on the model is deprecated
# in gensim 3.x (it triggers a warning) and removed in gensim 4.
model_ug_dbow.wv.most_similar('baik')

  """Entry point for launching an IPython kernel.


[('ih', 0.34407955408096313),
 ('didapat', 0.34024515748023987),
 ('binasa', 0.3342020809650421),
 ('pah', 0.3289330005645752),
 ('thing', 0.31657981872558594),
 ('pale', 0.3129824697971344),
 ('paradigma', 0.311463326215744),
 ('iseng', 0.309376060962677),
 ('iii', 0.3054810166358948),
 ('dorang', 0.30530425906181335)]

In [77]:
"""
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)
clf.score(validation_vecs_dbow, y_validation)
"""

'\ntrain_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)\nvalidation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)\nclf = LogisticRegression()\nclf.fit(train_vecs_dbow, y_train)\nclf.score(validation_vecs_dbow, y_validation)\n'

In [78]:
# Write the trained DBOW model to disk and read it straight back.
model_ug_dbow.save('lib/d2v_dbow_indo')
model_ug_dbow = Doc2Vec.load('lib/d2v_dbow_indo')
# Trim the in-memory copy only (the file was written before this call); the
# keyword flags ask to keep the doc-tag vectors and the inference state.
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
print('saved')

saved


# DMC (Distributed Memory, Concatenation)

In [79]:
# dm=1 with dm_concat=1 configures the DMC variant. alpha and min_alpha start
# equal because the learning rate is stepped down by hand in the training loop.
model_ug_dmc = Doc2Vec(
    dm=1,
    dm_concat=1,
    vector_size=100,
    window=2,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
# Build the vocabulary from the tagged documents, with a progress bar.
model_ug_dmc.build_vocab(list(tqdm(all_x_w2v)))

100%|██████████| 18182/18182 [00:00<00:00, 2187255.07it/s]


In [80]:
%%time
for epoch in tqdm(range(30)):
    model_ug_dmc.train(utils.shuffle([x for x in all_x_w2v]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|██████████| 30/30 [00:25<00:00,  1.22it/s]

CPU times: user 38.4 s, sys: 5.35 s, total: 43.8 s
Wall time: 25.6 s





In [81]:
# Nearest neighbours of the word 'bagus' in the learned word-vector space.
# Query through `.wv`: calling most_similar directly on the model is deprecated
# in gensim 3.x (it triggers a warning) and removed in gensim 4.
model_ug_dmc.wv.most_similar('bagus')

  """Entry point for launching an IPython kernel.


[('kreatif', 0.4769588112831116),
 ('aya', 0.455912709236145),
 ('xixixi', 0.4359319806098938),
 ('rnr', 0.4343729615211487),
 ('wehebabusahkan', 0.43410077691078186),
 ('nyelip', 0.4293331503868103),
 ('pansos', 0.41933026909828186),
 ('pakutopo', 0.41920655965805054),
 ('gapernah', 0.4191807210445404),
 ('mrk', 0.41541165113449097)]

In [82]:
"""
train_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)
validation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)
clf = LogisticRegression()
clf.fit(train_vecs_dmc, y_train)
clf.score(validation_vecs_dmc, y_validation)
"""

'\ntrain_vecs_dmc = get_vectors(model_ug_dmc, x_train, 100)\nvalidation_vecs_dmc = get_vectors(model_ug_dmc, x_validation, 100)\nclf = LogisticRegression()\nclf.fit(train_vecs_dmc, y_train)\nclf.score(validation_vecs_dmc, y_validation)\n'

In [83]:
# Write the trained DMC model to disk and read it straight back.
# NOTE(review): unlike the DBOW and DMM cells, this one does not call
# delete_temporary_training_data after reloading - confirm whether that
# omission is intentional.
model_ug_dmc.save('lib/d2v_dmc_indo')
model_ug_dmc = Doc2Vec.load('lib/d2v_dmc_indo')
print('saved')

saved


# DMM (Distributed Memory, Mean)

In [84]:
# dm=1 with dm_mean=1 configures the DMM variant. alpha and min_alpha start
# equal because the learning rate is stepped down by hand in the training loop.
model_ug_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=100,
    window=4,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
# Build the vocabulary from the tagged documents, with a progress bar.
model_ug_dmm.build_vocab(list(tqdm(all_x_w2v)))

100%|██████████| 18182/18182 [00:00<00:00, 2808663.65it/s]


In [85]:
%%time
for epoch in tqdm(range(30)):
    model_ug_dmm.train(utils.shuffle([x for x in all_x_w2v]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmm.alpha -= 0.002
    model_ug_dmm.min_alpha = model_ug_dmm.alpha

100%|██████████| 30/30 [00:52<00:00,  1.76s/it]

CPU times: user 49.3 s, sys: 20.3 s, total: 1min 9s
Wall time: 52.5 s





In [86]:
# Nearest neighbours of the word 'bagus' in the learned word-vector space.
# Query through `.wv`: calling most_similar directly on the model is deprecated
# in gensim 3.x (it triggers a warning) and removed in gensim 4.
model_ug_dmm.wv.most_similar('bagus')

  """Entry point for launching an IPython kernel.


[('beneran', 0.4241843819618225),
 ('serbet', 0.39719846844673157),
 ('bumn', 0.3946438133716583),
 ('slow', 0.39137014746665955),
 ('pertamina', 0.3892749547958374),
 ('jkw', 0.3874319791793823),
 ('hindar', 0.38322871923446655),
 ('adem', 0.3744054436683655),
 ('ser', 0.3720950782299042),
 ('leadership', 0.36461758613586426)]

In [87]:
"""
train_vecs_dmm = get_vectors(model_ug_dmm, x_train, 100)
validation_vecs_dmm = get_vectors(model_ug_dmm, x_validation, 100)
clf = LogisticRegression()
clf.fit(train_vecs_dmm, y_train)
clf.score(validation_vecs_dmm, y_validation)
"""

'\ntrain_vecs_dmm = get_vectors(model_ug_dmm, x_train, 100)\nvalidation_vecs_dmm = get_vectors(model_ug_dmm, x_validation, 100)\nclf = LogisticRegression()\nclf.fit(train_vecs_dmm, y_train)\nclf.score(validation_vecs_dmm, y_validation)\n'

In [88]:
# Write the trained DMM model to disk and read it straight back.
model_ug_dmm.save('lib/d2v_dmm_indo')
model_ug_dmm = Doc2Vec.load('lib/d2v_dmm_indo')
# Trim the in-memory copy only (the file was written before this call); the
# keyword flags ask to keep the doc-tag vectors and the inference state.
model_ug_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
print('saved')

saved
