In [1]:
import pickle
from tqdm import tqdm

# Import custom helper libraries
import os
import sys

# Make the project's sibling "src" directory importable, registering it only once
src_path = os.path.abspath(os.path.join("..", "src"))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

import data.helpers as data_helpers
import visualization.helpers as viz_helpers

# Maths modules
import pandas as pd

# Viz modules
import plotly.express as px

# Render for export
import plotly.io as pio

pio.renderers.default = "notebook"


In [2]:
# Download and unzip CSV files by running the `dataset` make target from the
# parent directory.
# NOTE(review): a `!` line runs in its own shell, so the trailing `cd notebooks`
# presumably has no effect on the kernel's working directory — confirm before
# removing it.
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [3]:
# Load data from CSV; column names are supplied explicitly via `names`
df = pd.read_csv(
    os.path.join(
        "..", "data", "raw", "training.1600000.processed.noemoticon.csv"
    ),
    names=["target", "id", "date", "flag", "user", "text"],
)

# Reduce memory usage
df = data_helpers.reduce_dataframe_memory_usage(df)

# Drop useless columns (reassign instead of mutating in place so the cell
# reads as a plain data-flow step)
df = df.drop(columns=["id", "date", "flag", "user"])

# Replace target values with labels.
# The result is assigned back to the column: calling
# `df.target.replace(..., inplace=True)` operates on a selection of the frame
# and, with pandas Copy-on-Write, never updates `df` itself.
df["target"] = df["target"].replace(
    {
        0: "NEGATIVE",
        2: "NEUTRAL",
        4: "POSITIVE",
    }
)

df.describe()


Unnamed: 0,target,text
count,1600000,1600000
unique,2,1581466
top,NEGATIVE,isPlayer Has Died! Sorry
freq,800000,210


In [4]:
# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizers, Stemmers and Lemmatizers
import nltk
from nltk.corpus import stopwords
import spacy

# Download resources
nltk.download("stopwords")
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# words; the import above has to run again before this line can be re-executed.
stopwords = set(stopwords.words("english"))

# Load the SpaCy model, downloading it first when it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError for a missing model. Downloading through the
    # Python API installs into the interpreter that runs this kernel.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


# Define tokenizer
def tokenizer(text):
    """Return the lower-cased SpaCy lemmas of `text`.

    Only tokens that are alphabetic and that SpaCy does not flag as stop
    words are kept.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    ]


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
    processed_data_path, "tfidf_spacy_dataset.pkl"
)
vocabulary_file_path = os.path.join(
    processed_data_path, "tfidf_spacy_vocabulary.pkl"
)

# Rebuild as soon as EITHER cached file is missing; requiring both to be
# missing would leave a half-written cache unrepaired.
if not (
    os.path.exists(vectorized_dataset_file_path)
    and os.path.exists(vocabulary_file_path)
):
    # Make sure the output directory exists before the long computation, so a
    # missing directory fails here rather than at the final write.
    os.makedirs(processed_data_path, exist_ok=True)

    # Define vectorizer (stop words are passed as a list, the collection type
    # scikit-learn documents for this parameter)
    vectorizer = TfidfVectorizer(
        strip_accents="unicode",
        lowercase=True,
        stop_words=sorted(stopwords),
        tokenizer=tokenizer,
    )

    # Vectorize text
    X = vectorizer.fit_transform(df.text)

    # Get vocabulary
    vocabulary = vectorizer.get_feature_names_out()

    # Save vectorized dataset as pickle
    with open(vectorized_dataset_file_path, "wb") as f:
        pickle.dump(X, f)

    # Save vocabulary as pickle
    with open(vocabulary_file_path, "wb") as f:
        pickle.dump(vocabulary, f)


2022-01-16 09:18:05.941016: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-16 09:18:05.941042: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/clement/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer

# Tokenizers, Stemmers and Lemmatizers
import nltk
from nltk.corpus import stopwords
import spacy

# Download resources
nltk.download("stopwords")
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# words; the import above has to run again before this line can be re-executed.
stopwords = set(stopwords.words("english"))

# Load the SpaCy model, downloading it first when it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError for a missing model. Downloading through the
    # Python API installs into the interpreter that runs this kernel.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


# Define tokenizer
def tokenizer(text):
    """Return the lower-cased SpaCy lemmas of `text`.

    Only tokens that are alphabetic and that SpaCy does not flag as stop
    words are kept.
    """
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.is_alpha and not token.is_stop
    ]


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
    processed_data_path, "count_spacy_dataset.pkl"
)
vocabulary_file_path = os.path.join(
    processed_data_path, "count_spacy_vocabulary.pkl"
)

# Rebuild as soon as EITHER cached file is missing; requiring both to be
# missing would leave a half-written cache unrepaired.
if not (
    os.path.exists(vectorized_dataset_file_path)
    and os.path.exists(vocabulary_file_path)
):
    # Make sure the output directory exists before the long computation, so a
    # missing directory fails here rather than at the final write.
    os.makedirs(processed_data_path, exist_ok=True)

    # Define vectorizer (stop words are passed as a list, the collection type
    # scikit-learn documents for this parameter)
    vectorizer = CountVectorizer(
        strip_accents="unicode",
        lowercase=True,
        stop_words=sorted(stopwords),
        tokenizer=tokenizer,
    )

    # Vectorize text
    X = vectorizer.fit_transform(df.text)

    # Get vocabulary
    vocabulary = vectorizer.get_feature_names_out()

    # Save vectorized dataset as pickle
    with open(vectorized_dataset_file_path, "wb") as f:
        pickle.dump(X, f)

    # Save vocabulary as pickle
    with open(vocabulary_file_path, "wb") as f:
        pickle.dump(vocabulary, f)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/clement/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from transformers import BertTokenizerFast


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
encoded_dataset_file_path = os.path.join(
    processed_data_path, "bert_encoded_dataset.pkl"
)

# Only encode when no cached result is present on disk
if not os.path.exists(encoded_dataset_file_path):
    # Make sure the output directory exists before encoding the whole corpus,
    # so a missing directory fails here rather than at the final write.
    os.makedirs(processed_data_path, exist_ok=True)

    bert = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # Encode text (one `encode` call per document, with a progress bar)
    X = [bert.encode(doc) for doc in tqdm(df.text)]

    # Save encoded dataset as pickle
    with open(encoded_dataset_file_path, "wb") as f:
        pickle.dump(X, f)


In [7]:
from transformers import XLNetTokenizerFast


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
encoded_dataset_file_path = os.path.join(
    processed_data_path, "xlnet_encoded_dataset.pkl"
)

# Only encode when no cached result is present on disk
if not os.path.exists(encoded_dataset_file_path):
    # Make sure the output directory exists before encoding the whole corpus,
    # so a missing directory fails here rather than at the final write.
    os.makedirs(processed_data_path, exist_ok=True)

    xlnet = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")

    # Encode text (one `encode` call per document, with a progress bar)
    X = [xlnet.encode(doc) for doc in tqdm(df.text)]

    # Save encoded dataset as pickle
    with open(encoded_dataset_file_path, "wb") as f:
        pickle.dump(X, f)


In [8]:
import spacy


# Processed data path
processed_data_path = os.path.join("..", "data", "processed")
vectorized_dataset_file_path = os.path.join(
    processed_data_path, "spacy_vectorized_dataset.pkl"
)

# Only vectorize when no cached result is present on disk
if not os.path.exists(vectorized_dataset_file_path):
    # Make sure the output directory exists before processing the whole corpus,
    # so a missing directory fails here rather than at the final write.
    os.makedirs(processed_data_path, exist_ok=True)

    # Load the SpaCy model, downloading it first when it is not installed
    try:
        nlp = spacy.load("en_core_web_lg")
    except OSError:
        # spacy.load raises OSError for a missing model. Downloading through
        # the Python API installs into the interpreter that runs this kernel.
        spacy.cli.download("en_core_web_lg")
        nlp = spacy.load("en_core_web_lg")

    # Encode text: stream the documents through `nlp.pipe` (batched
    # processing) and keep each document's vector. `total` is given because
    # the generator returned by `pipe` has no length for the progress bar.
    X = [doc.vector for doc in tqdm(nlp.pipe(df.text), total=len(df))]

    # Save vectorized dataset as pickle
    with open(vectorized_dataset_file_path, "wb") as f:
        pickle.dump(X, f)
