### Exploring the dataset to develop human-coded dictionaries
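Here I train separate word2vec models on the men's and women's article corpora and inspect each model's nearest-neighbour words to further develop my human-coded dictionaries for analysing the topics discussed in the articles.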

In [None]:
import gensim.downloader
import pandas as pd
import numpy as np
import torch
import random
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
import datasets
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
import nltk

# enabling inline plots in Jupyter
%matplotlib inline
# Lower the `datasets` library's log verbosity to errors only, so its
# informational and warning messages do not clutter the cell outputs.
datasets.logging.set_verbosity_error()

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collectin

In [None]:
# Mount Google Drive at /content/drive (Colab only). Every CSV and model path
# used in this notebook lives under /content/drive/MyDrive, so this cell has to
# run before any cell that reads or writes those files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# solution
class MyDataLoader(object):
    """
    Re-iterable reader that streams a corpus file one tokenized line at a time.

    One non-blank line of the file is treated as one chunk (here, arbitrarily,
    a line corresponds to a paragraph). Each chunk is tokenized with
    ``word_tokenize`` and lowercased before it is yielded. The file is opened
    anew on every ``__iter__`` call, so the same instance can be iterated
    more than once.

    Args:
        filename (str): The name of the corpus file.
    """

    def __init__(self, filename):
        # Only the path is stored; the file itself is opened lazily in __iter__.
        self.corpus = filename

    def __iter__(self):
        """
        Iterate over the non-blank lines of the corpus file.

        Yields:
            list[str]: the lowercased tokens of one line.
        """
        # The context manager guarantees the handle is closed when iteration
        # finishes, when the consumer stops early, or when tokenization raises.
        with open(self.corpus, "r", encoding="utf-8") as corpus_file:
            for line in corpus_file:
                # Whitespace-only lines carry no tokens and are skipped.
                if not line.strip():
                    continue
                # Pre-processing on the fly: tokenize, then lowercase.
                yield [token.lower() for token in word_tokenize(line)]

In [None]:
# Load the men's articles table from Drive. Later cells read its
# 'article_text' column to build the men's training corpus.
# NOTE(review): the file name suggests this is LIWC analysis output — confirm the schema.
liwc_analysis_men = pd.read_csv('/content/drive/MyDrive/SDS/Thesis/Data/liwc_analysis_men.csv')

In [None]:
# Load the women's articles table from Drive. Later cells read its
# 'article_text' column to build the women's training corpus.
# NOTE(review): the file name suggests this is LIWC analysis output — confirm the schema.
liwc_analysis_women = pd.read_csv('/content/drive/MyDrive/SDS/Thesis/Data/liwc_analysis_women.csv')

In [None]:
# Ensure nltk's tokenizer resources are available: word_tokenize (used by
# MyDataLoader and process_text) needs the Punkt data at call time.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Raw article texts, one string per article.
# pandas represents a missing CSV cell as NaN (a float), never as None, so an
# `is not None` test lets missing rows through and str() turns them into the
# literal text 'nan', which would then be tokenized as a word. dropna() removes
# NaN and None alike before the conversion.
men_text_raw = [str(text) for text in liwc_analysis_men['article_text'].dropna()]
women_text_raw = [str(text) for text in liwc_analysis_women['article_text'].dropna()]

# Function to split text into sentences and tokenize each sentence
def process_text(raw_text):
    """
    Turn an iterable of raw documents into a flat list of tokenized sentences.

    Each document is cut wherever a '.', '!' or '?' occurs. Every resulting
    fragment is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic are dropped and the rest are lowercased (stopwords are kept).
    Fragments that end up with no tokens are omitted.

    Args:
        raw_text: iterable of strings, one per document.

    Returns:
        list[list[str]]: one token list per non-empty sentence, in document order.
    """
    sentence_boundary = re.compile(r'[.!?]')
    tokenized = []
    for document in raw_text:
        for fragment in sentence_boundary.split(document):
            # Lowercase after filtering so only alphabetic tokens are kept.
            words = [tok.lower() for tok in word_tokenize(fragment) if tok.isalpha()]
            if words:
                tokenized.append(words)
    return tokenized

# Process both men and women texts: each result is a flat list of tokenized
# sentences (lists of lowercase alphabetic tokens), the input format passed to
# Word2Vec in the training cells.
men_text = process_text(men_text_raw)
women_text = process_text(women_text_raw)

# Example output: sanity-check the tokenization on a small slice only.
print("Tokenized men text:", men_text[:5])  # Display first 5 tokenized sentences
print("Tokenized women text:", women_text[:5])  # Same preview for the women's corpus


Tokenized men text: [['with', 'all', 'eyes', 'on', 'diego', 'costa', 'and', 'where', 'he', 'stamps', 'his', 'feet', 'following', 'chelsea', 's', 'fractious', 'capital', 'cup', 'win', 'over', 'liverpool', 'the', 'other', 'striker', 'who', 'left', 'an', 'unfortunate', 'mark', 'at', 'stamford', 'bridge', 'was', 'relegated', 'to', 'a', 'footnote'], ['fitting', 'for', 'that', 'is', 'how', 'mario', 'balotelli', 's', 'liverpool', 'career', 'is', 'playing', 'out'], ['sky', 's', 'commentary', 'team', 'heralded', 'balotelli', 's', 'introduction', 'for', 'lazar', 'markovic', 'as', 'a', 'chance', 'to', 'put', 'one', 'over', 'the', 'former', 'internazionale', 'coach', 'who', 'labelled', 'him', 'unmanageable', 'josé', 'mourinho'], ['ultimately', 'his', 'contribution', 'amounted', 'to', 'no', 'more', 'than', 'weakening', 'a', 'previously', 'impressive', 'dangerous', 'liverpool', 'performance', 'and', 'unwittingly', 'assisting', 'in', 'chelsea', 's', 'winning', 'goal'], ['not', 'all', 'substitutions',

In [None]:
# Train a skip-gram word2vec model on the men's corpus.
# NOTE(review): a later cell rebinds `men2vec` to a model loaded from Drive, so
# this training only needs to run when that saved model has to be (re)built — confirm.
men2vec = gensim.models.Word2Vec(
    men_text,   # tokenized sentences from the men's articles (output of process_text)
    vector_size=300,     # the dimensionality of the target vectors
    window=3,     # context window: max distance between target and context word
    min_count=5,  # ignoring words that occur fewer than 5 times
    epochs=3,      # how many training passes to make over the corpus
    sg = 1)       # 1 for skip-gram model, 0 for cbow


In [None]:
# Train a skip-gram word2vec model on the women's corpus, with the same
# hyperparameters as the men's model.
# NOTE(review): a later cell rebinds `women2vec` to a model loaded from Drive, so
# this training only needs to run when that saved model has to be (re)built — confirm.
women2vec = gensim.models.Word2Vec(
    women_text,   # tokenized sentences from the women's articles (output of process_text)
    vector_size=300,     # the dimensionality of the target vectors
    window=3,     # context window: max distance between target and context word
    min_count=5,  # ignoring words that occur fewer than 5 times
    epochs=3,      # how many training passes to make over the corpus
    sg = 1)       # 1 for skip-gram model, 0 for cbow


In [None]:
# load in men2vec model
# Rebinds `men2vec` to the model stored on Drive, replacing any model trained
# earlier in this session.
# NOTE(review): the cell that writes this file appears further down the notebook,
# so a fresh top-to-bottom run loads a file from a previous session (or fails if
# none exists) — confirm the intended run order.
men2vec = gensim.models.Word2Vec.load('/content/drive/MyDrive/SDS/Thesis/Data/men2vec.model')

In [None]:
# load in women2vec model
# Rebinds `women2vec` to the model stored on Drive, replacing any model trained
# earlier in this session.
# NOTE(review): the cell that writes this file appears further down the notebook,
# so a fresh top-to-bottom run loads a file from a previous session (or fails if
# none exists) — confirm the intended run order.
women2vec = gensim.models.Word2Vec.load('/content/drive/MyDrive/SDS/Thesis/Data/women2vec.model')

In [None]:
# For each seed word, report how often it occurs in the men's corpus and which
# words the men's model places closest to it (candidates for the dictionary).
for word_of_interest in ['tournament', 'championship', 'finals']:
    use_count = men2vec.wv.get_vecattr(word_of_interest, "count")
    # One print call, two lines of output: the frequency line and the header.
    print(
        f"'{word_of_interest}' was mentioned {use_count} time(s) in the corpus",
        f"The words most similar to {word_of_interest} are:",
        sep="\n",
    )
    print(men2vec.wv.most_similar(word_of_interest))

'tournament' was mentioned 14939 time(s) in the corpus
The words most similar to tournament are:
[('tournaments', 0.6567858457565308), ('euros', 0.6496333479881287), ('confederations', 0.6176959276199341), ('euro', 0.6136319637298584), ('wc', 0.6122615337371826), ('world', 0.6076704263687134), ('jamboree', 0.5851637721061707), ('competition', 0.58104008436203), ('olympics', 0.5578311681747437), ('toulon', 0.5536025166511536)]
'championship' was mentioned 16327 time(s) in the corpus
The words most similar to championship are:
[('spl', 0.600573718547821), ('premiership', 0.5939382910728455), ('tykes', 0.5684801936149597), ('prem', 0.5678666234016418), ('jpt', 0.5631271600723267), ('ipswich', 0.5630439519882202), ('tier', 0.5626885294914246), ('middlesbrough', 0.5596318244934082), ('premier', 0.5564159154891968), ('league', 0.5554913282394409)]
'finals' was mentioned 4319 time(s) in the corpus
The words most similar to finals are:
[('championships', 0.685486912727356), ('tournaments', 0.6

In [None]:
# For each seed word, report how often it occurs in the women's corpus and which
# words the women's model places closest to it (candidates for the dictionary).
for word_of_interest in ['tournament', 'championship', 'finals']:
    use_count = women2vec.wv.get_vecattr(word_of_interest, "count")
    # One print call, two lines of output: the frequency line and the header.
    print(
        f"'{word_of_interest}' was mentioned {use_count} time(s) in the corpus",
        f"The words most similar to {word_of_interest} are:",
        sep="\n",
    )
    print(women2vec.wv.most_similar(word_of_interest))

'tournament' was mentioned 3464 time(s) in the corpus
The words most similar to tournament are:
[('euros', 0.7444405555725098), ('euro', 0.7153774499893188), ('competition', 0.7028447985649109), ('world', 0.6778019070625305), ('tokyo', 0.6638725996017456), ('olympics', 0.662897527217865), ('tournaments', 0.6528619527816772), ('milestone', 0.6435577273368835), ('shebelieves', 0.6323204636573792), ('earliest', 0.6282159090042114)]
'championship' was mentioned 865 time(s) in the corpus
The words most similar to championship are:
[('championships', 0.8226462602615356), ('promotion', 0.7910805940628052), ('maiden', 0.7849583625793457), ('undefeated', 0.7748493552207947), ('inaugural', 0.7733708024024963), ('summit', 0.7721524834632874), ('destination', 0.7640843391418457), ('premiership', 0.7626152634620667), ('rescheduled', 0.7624508142471313), ('showpiece', 0.759446382522583)]
'finals' was mentioned 390 time(s) in the corpus
The words most similar to finals are:
[('qualifiers', 0.86307781

In [None]:
# save model
# Writes the current `men2vec` object to Drive, at the same path the load cell
# reads from.
# NOTE(review): if the load cell ran after training, the object saved here is the
# previously stored model, not the freshly trained one — confirm the run order.
men2vec.save('/content/drive/MyDrive/SDS/Thesis/Data/men2vec.model')

In [None]:
# save women model
# Writes the current `women2vec` object to Drive, at the same path the load cell
# reads from.
# NOTE(review): if the load cell ran after training, the object saved here is the
# previously stored model, not the freshly trained one — confirm the run order.
women2vec.save('/content/drive/MyDrive/SDS/Thesis/Data/women2vec.model')