<a href="https://colab.research.google.com/github/yanaa11/media-opinion-analyzer/blob/main/train_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import random
import time

import pandas as pd
import numpy as np

import gensim
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data preparation

### Upload data

In [None]:
os.listdir("/content/drive/MyDrive/data_reddit_clean")

['JoeBiden_comments_2020_clean.csv', 'Trump_comments_2020_clean.csv']

In [3]:
#comments_dir = "Lecture - Text embeddings (Medvedev)/codes/comments"
comments_dir = "/content/drive/MyDrive/data_reddit_clean"

In [14]:
# TRUMP COMMENTS
trump_comments_file = "./Trump_comments_2020_clean.csv"
trump_comments_path = os.path.join(comments_dir, trump_comments_file)
trump_df = pd.read_csv(trump_comments_path, index_col = 0)


# BIDEN COMMENTS
biden_comments_file = './JoeBiden_comments_2020_clean.csv'
biden_comments_path = os.path.join(comments_dir, biden_comments_file)
biden_df = pd.read_csv(biden_comments_path, index_col = 0)


In [15]:
trump_df['who'] = 0
biden_df['who'] = 1

In [6]:
all_df = pd.concat([trump_df, biden_df])
all_df = all_df.reset_index(drop=True)
all_df = all_df.drop(['author', 'created_utc', 'link_id', 'parent_id', 'score'], axis=1)

In [7]:
len(trump_df), len(biden_df)

(618858, 499528)

In [16]:
n_entries = 50000 #entries from initial dataframes from each candidate for training
n_test = 10000 #entries from initial dataframes from each candidate for test

s_trump_df_train = trump_df[:n_entries]
s_trump_df_train['who'] = 0 # 0 == trump
s_biden_df_train = biden_df[:n_entries]
s_biden_df_train['who'] = 1 # 1 == biden
s_all_df_train = pd.concat([s_trump_df_train, s_biden_df_train])
s_all_df_train = s_all_df_train.reset_index(drop=True)

s_trump_df_test = trump_df[n_entries:n_entries+n_test]
s_trump_df_test['who'] = 0 # 0 == trump
s_biden_df_test = biden_df[n_entries:n_entries+n_test]
s_biden_df_test['who'] = 1 # 1 == biden
s_all_df_test = pd.concat([s_trump_df_test, s_biden_df_test])
s_all_df_test = s_all_df_test.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https:

In [17]:
del(trump_df)
del(biden_df)

### Tokenize

In [9]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags       # strip html tags
from gensim.parsing.preprocessing import strip_short      
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation, strip_non_alphanum

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
CUSTOM_FILTERS = [lambda x: x.lower(), strip_non_alphanum, strip_punctuation, strip_short, 
                  strip_tags, remove_stopwords]

preprocess_string(all_df['body'][0], CUSTOM_FILTERS)

['fighters',
 'means',
 'warrior',
 'like',
 'people',
 'participate',
 'war',
 'fuck',
 'defending',
 'enemy']

In [11]:
all_df['body_tokenized'] = [preprocess_string(all_df['body'][i], CUSTOM_FILTERS) for i in range(len(all_df))]

In [19]:
s_all_df_train['body_tokenized'] = [preprocess_string(s_all_df_train['body'][i], CUSTOM_FILTERS) for i in range(len(s_all_df_train))]
s_all_df_test['body_tokenized'] = [preprocess_string(s_all_df_test['body'][i], CUSTOM_FILTERS) for i in range(len(s_all_df_test))]

In [21]:
s_all_df_test = s_all_df_test.drop(['author', 'created_utc', 'link_id', 'parent_id', 'score'], axis=1)
s_all_df_train = s_all_df_train.drop(['author', 'created_utc', 'link_id', 'parent_id', 'score'], axis=1)

# Vectorization

In [27]:
tagged_train_data = [TaggedDocument(words=d, tags=[str(i)]) for i, d in enumerate(s_all_df_train['body_tokenized'].to_list())]

Train 2 models: 

**PV-DM:** Distributed Memory.
When training a vector representing the paragraph is fed to the network. It acts a memory of what is missing from the current context, the window of words around the trained word represents the context of a word.

**PV-DBOW:** Distributed bag of words.
The paragraph vector is directly fed into a classifier and predictions of the words present in the paragraph is made based on the vector.

In [28]:
TRAIN_MODELS = True

# DM
if TRAIN_MODELS:
    model_dm = Doc2Vec(vector_size=512,
                    window=5, 
                    alpha=.025, 
                    min_alpha=0.00025, 
                    min_count=2, 
                    dm=1)
    model_dm.build_vocab(tagged_train_data)

    for epoch in tqdm(range(10)):
        print(f'Epoch {epoch}')
        model_dm.train(tagged_train_data,
                    total_examples=model_dm.corpus_count,
                    epochs=model_dm.epochs)
        
        print(model_dm.running_training_loss)
        # decrease the learning rate
        model_dm.alpha -= 0.00025
        # fix the learning rate, no decay
        model_dm.min_alpha = model_dm.alpha

    model_dm.save('/content/drive/MyDrive/cdm_model.d2v')
else:
    model_dm = Doc2Vec.load('/content/drive/MyDrive/dm_model.d2v')

# DBOW
if TRAIN_MODELS:
    model_dbow = Doc2Vec(vector_size=512,
                    window=5, 
                    alpha=.025, 
                    min_alpha=0.00025, 
                    min_count=2, 
                    dm=0)
    model_dbow.build_vocab(tagged_train_data)

    for epoch in tqdm(range(10)):
        print(f'Epoch {epoch}')
        model_dbow.train(tagged_train_data,
                    total_examples=model_dbow.corpus_count,
                    epochs=model_dbow.epochs)
        
        print(model_dbow.running_training_loss)
        # decrease the learning rate
        model_dbow.alpha -= 0.00025
        # fix the learning rate, no decay
        model_dbow.min_alpha = model_dbow.alpha

    model_dbow.save('/content/drive/MyDrive/dbow_model.d2v')
else:
    model_dbow = Doc2Vec.load('/content/drive/MyDrive/dbow_model.d2v')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch 0
0.0
Epoch 1
0.0
Epoch 2
0.0
Epoch 3
0.0
Epoch 4
0.0
Epoch 5
0.0
Epoch 6
0.0
Epoch 7
0.0
Epoch 8
0.0
Epoch 9
0.0



HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Epoch 0
0.0
Epoch 1
0.0
Epoch 2
0.0
Epoch 3
0.0
Epoch 4
0.0
Epoch 5
0.0
Epoch 6
0.0
Epoch 7
0.0
Epoch 8
0.0
Epoch 9
0.0



Combining both models

In [30]:
!pip install testfixtures

Collecting testfixtures
[?25l  Downloading https://files.pythonhosted.org/packages/9c/93/08cbd1203cd490ac789f42a9594540d1b6c4d8fca6c2d06296b284518052/testfixtures-6.17.1-py2.py3-none-any.whl (95kB)
[K     |███▌                            | 10kB 11.8MB/s eta 0:00:01[K     |███████                         | 20kB 17.1MB/s eta 0:00:01[K     |██████████▍                     | 30kB 13.1MB/s eta 0:00:01[K     |█████████████▉                  | 40kB 10.0MB/s eta 0:00:01[K     |█████████████████▎              | 51kB 6.0MB/s eta 0:00:01[K     |████████████████████▊           | 61kB 6.8MB/s eta 0:00:01[K     |████████████████████████▏       | 71kB 7.1MB/s eta 0:00:01[K     |███████████████████████████▋    | 81kB 7.7MB/s eta 0:00:01[K     |███████████████████████████████ | 92kB 8.5MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 4.9MB/s 
[?25hInstalling collected packages: testfixtures
Successfully installed testfixtures-6.17.1


In [32]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

model_dbow.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True)

model_dm.delete_temporary_training_data(
    keep_doctags_vectors=True,
    keep_inference=True)

new_model = ConcatenatedDoc2Vec([model_dbow, model_dm])

#new_model.save('/content/drive/MyDrive/new_model.d2v')

Infer vectors

In [33]:
s_all_df_train["vec"] = s_all_df_train["body_tokenized"].apply(new_model.infer_vector)
s_all_df_test["vec"] = s_all_df_test["body_tokenized"].apply(new_model.infer_vector)

#all_df["vec"] = all_df["body_tokenized"].apply(new_model.infer_vector)

In [34]:
#all_df.to_csv("vectorized_data_d2v.csv")

s_all_df_train.to_csv("/content/drive/MyDrive/vectorized_train_d2v.csv")
s_all_df_test.to_csv("/content/drive/MyDrive/vectorized_test_d2v.csv")