## Input CSV to Pandas

In [1]:
import pandas as pd
import numpy as np

In [4]:
dataTrain = pd.read_csv("clean_tweet_training.csv", index_col=False)
dataTest = pd.read_csv("clean_tweet_test.csv", index_col=False)

In [7]:
dataTrain = dataTrain.dropna()
dataTest = dataTest.dropna()
dataTrain.info()
dataTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29702 entries, 0 to 29701
Data columns (total 3 columns):
Unnamed: 0    29702 non-null int64
Text          29702 non-null object
Verdict       29702 non-null int64
dtypes: int64(2), object(1)
memory usage: 928.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20631 entries, 0 to 20631
Data columns (total 3 columns):
Unnamed: 0    20631 non-null int64
Text          20631 non-null object
Verdict       20631 non-null int64
dtypes: int64(2), object(1)
memory usage: 644.7+ KB


## Training Doc2Vec Model

In [9]:
!pip install -U gensim
import multiprocessing
from gensim.models import Doc2Vec
import gensim.models.doc2vec
cores = multiprocessing.cpu_count()

assert gensim.models.doc2vec.FAST_VERSION > -1

Requirement already up-to-date: gensim in /home/gemilang/.local/lib/python3.7/site-packages (3.6.0)


### Prepare tagged data (taggedDocuments from gensim)

In [27]:
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import ToktokTokenizer
toktok = ToktokTokenizer().tokenize

In [31]:
def GetTaggedData(data):
    taggedData = []
    for row in data.itertuples(index=True,name="Pandas"):
      text = row.Text  
      words = toktok(text)    
      verdict = getattr(row,"Verdict")
      tags = [str(verdict), str(row.Index)]
      taggedData.append(TaggedDocument(words,tags))  
    return taggedData

In [32]:
taggedDataTrain = GetTaggedData(dataTrain)
taggedDataTest = GetTaggedData(dataTest)

### Initialize model

In [19]:
%%time
model = Doc2Vec(dm=1, alpha = 0.02, min_alpha = 0.0025, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=1, sample=0, epochs=20, workers=cores)
model.build_vocab(taggedDataTrain)

CPU times: user 754 ms, sys: 0 ns, total: 754 ms
Wall time: 753 ms


### Train the model

In [20]:
%%time
MAX_ITERATION = 10
for epoch in range(MAX_ITERATION):
  print("Now training epoch",epoch+1)
  model.train(taggedDataTrain, total_examples = model.corpus_count, epochs = model.epochs )
  model.alpha-= 0.0002
  model.min_alpha=model.alpha

Now training epoch 1
Now training epoch 2
Now training epoch 3
Now training epoch 4
Now training epoch 5
Now training epoch 6
Now training epoch 7
Now training epoch 8
Now training epoch 9
Now training epoch 10
Now training epoch 11
Now training epoch 12
Now training epoch 13
Now training epoch 14
Now training epoch 15
Now training epoch 16
Now training epoch 17
Now training epoch 18
Now training epoch 19
Now training epoch 20
Now training epoch 21
Now training epoch 22
Now training epoch 23
Now training epoch 24
Now training epoch 25
Now training epoch 26
Now training epoch 27
Now training epoch 28
Now training epoch 29
Now training epoch 30
Now training epoch 31
Now training epoch 32
Now training epoch 33
Now training epoch 34
Now training epoch 35
Now training epoch 36
Now training epoch 37
Now training epoch 38
Now training epoch 39
Now training epoch 40
CPU times: user 3min 1s, sys: 1.48 s, total: 3min 3s
Wall time: 3min 1s


### saving the doc2vec model

In [21]:
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("doc2vec_model1")
model.save(fname)

## Using Logistic Regression to Predict the dataTest

### Prearing feature vector for the classifier

In [38]:
def GetLearningVec(model, taggedData):
    X_data, Y_data = [], []
    for data in taggedData:
        words = data[0]
        verdict = data[1][0]
        vector = model.infer_vector(words)
        X_data.append(vector)
        Y_data.append(float(verdict))
    return X_data,Y_data

In [39]:
%%time
X_dataTrain, Y_dataTrain = GetLearningVec(model, taggedDataTrain)
X_dataTest, Y_dataTest = GetLearningVec(model, taggedDataTest)

### Training the logistic regression classifier

In [40]:
%%time
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_dataTrain, Y_dataTrain)
Y_prediction = logreg.predict(X_dataTest)



CPU times: user 786 ms, sys: 22.4 ms, total: 808 ms
Wall time: 969 ms


### Check Fitness

In [41]:
from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(Y_dataTest, Y_prediction))
print('Testing F1 score: {}'.format(f1_score(Y_dataTest, Y_prediction, average='weighted')))

Testing accuracy 0.4025010905918278
Testing F1 score: 0.36535988288264754


  'precision', 'predicted', average, warn_for)
