In [2]:
from google.cloud import bigquery
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
# !pip install transformers==4.3.2
import torch
import io
import torch.nn.functional as F
import random
import numpy as np
import time
import math
import datetime
import torch.nn as nn
from typing import Union
from transformers import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install sentencepiece

##Set random values
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed_val)

#client = bigquery.Client()

In [15]:
# Load the filtered unlabeled comments and all the labeled comments.
fb_comments_unlabeled_df = pd.read_pickle("./unlabeled_filtered_comments.pkl")
fb_comments_labeled_df = pd.read_pickle("./labeled_comments.pkl")

print("You have ", len(fb_comments_unlabeled_df), " unlabeled comments")
print("You're using: ", len(fb_comments_labeled_df), " labeled comments.")
n_neg, n_pos = fb_comments_labeled_df["sentiment_label"].value_counts()[0], fb_comments_labeled_df["sentiment_label"].value_counts()[1]
print("Negative: %d%% (%d)" %(n_neg*100/len(fb_comments_labeled_df), n_neg))
print("Positive: %d%% (%d)" %(n_pos*100/len(fb_comments_labeled_df), n_pos))

You have  6794  unlabeled comments
You're using:  959  labeled comments.
Negative: 74% (710)
Positive: 25% (249)


In [16]:
# Select a percentage (of the unlabeled comments) for the labeled data. At this moment, we have 6794 comments in total.
             
def create_test_set(labeled_comments: pd.core.frame.DataFrame, test_size: Union[int, float]) -> tuple:
        print(labeled_comments.shape)
        if isinstance(test_size, int):
            test_size = test_size / labeled_comments.shape[0]
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state = 0)
        sss.get_n_splits(labeled_comments["message"].values, labeled_comments["sentiment_label"].values)
        train_labeled_data = None
        test_labeled_data = None
        for train_index, test_index in sss.split(labeled_comments["message"].values, labeled_comments["sentiment_label"].values):
            train_labeled_data = labeled_comments.iloc[train_index]
            test_labeled_data = labeled_comments.iloc[test_index]
        train_labeled_data.to_pickle("./train_labeled_data.pkl")
        test_labeled_data.to_pickle("./test_labeled_data.pkl")
        
        return train_labeled_data, test_labeled_data
        
test_size = 280
train_labeled_data, test_labeled_data = create_test_set(fb_comments_labeled_df, test_size)

(959, 5)


In [21]:
print("Test set.")
print(test_labeled_data.shape)
print("pos: %d%%, neg: %d%%" %(round(100*test_labeled_data["sentiment_label"].value_counts()[1]/test_labeled_data.shape[0]), 
                               round(100*test_labeled_data["sentiment_label"].value_counts()[0]/test_labeled_data.shape[0])))

Test set.
(280, 5)
pos: 26%, neg: 74%


In [22]:
print("Train set.")
print(train_labeled_data.shape)
print("pos: %d%%, neg: %d%%" %(round(100*train_labeled_data["sentiment_label"].value_counts()[1]/train_labeled_data.shape[0]), 
                               round(100*train_labeled_data["sentiment_label"].value_counts()[0]/train_labeled_data.shape[0])))

Train set.
(679, 5)
pos: 26%, neg: 74%


In [23]:
# Conserve this function just in case. Not being used at the moment.

In [None]:
def train_test_split(labeled_comments: pd.core.frame.DataFrame, unlabeled_comments: pd.core.frame.DataFrame,
                     test_size: Union[int, float], class_percentages: dict, new_split: bool = False, 
                     display_class_percentages: bool = False, test_file: str = "") -> tuple:
    df = labeled_comments
    df2 = unlabeled_comments
    
    if new_split:
        # Shuffle the data.
        df = df.sample(frac=1).reset_index(drop=True)

        if class_percentages["same_as_train"]:
            # Create the train-test split ensuring the class percentages are maintained.
            if isinstance(test_size, int):
                # Convert the test size expressed as no. of comments to a percentage of labeled comments.
                test_size = test_size / df.shape[0]        
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
            sss.get_n_splits(df["message"].values, df["sentiment_label"].values)
            train_data = None
            test_data = None
            for train_index, test_index in sss.split(df["message"].values, df["sentiment_label"].values):
                train_data = df.iloc[train_index]
                test_data = df.iloc[test_index]
            test_neg = test_data["sentiment_label"].value_counts()[0]/test_data.shape[0]
            test_pos = test_data["sentiment_label"].value_counts()[1]/test_data.shape[0]
        else:
            # Create the test set ensuring the requested class percentages.
            test_neg = class_percentages["neg"]
            test_pos = class_percentages["pos"]
            if isinstance(test_size, float):
                # Convert the test size expressed as a percentage of labeled comments to a no. of comments.
                test_size = round(test_size * df.shape[0])
            neg_comments = df.loc[df["sentiment_label"] == "neg"].head(int(test_size * test_neg))
            pos_comments = df.loc[df["sentiment_label"] == "pos"].head(int(test_size * test_pos))
            test_data = pd.concat([neg_comments, pos_comments])
            # Create the train set with the remaining comments (class percentages are not controlled).
            train_data = df[~df.comment_id.isin(test_data.comment_id)]
        # Save the data.
        train_data.to_pickle("./train_data.pkl")
        test_data.to_pickle("./test_data_%d-%d_neg-pos.pkl" %(round(100*test_neg), round(100*test_neg)))
    else:
        test_file = "./test_data.pkl" if not test_file else "./" + test_file
        train_data = pd.read_pickle("./train_data.pkl")
        test_data = pd.read_pickle(test_file)
    
    # Make tuples with labeled data: (feature, label). 
    unlabeled_arr = np.array([(message, "UNK_UNK") for message in df2["message"].values])
    train_arr = np.array([(row["message"], row["sentiment_label"]) for _, row in train_data.iterrows()])
    test_arr = np.array([(row["message"], row["sentiment_label"]) for _, row in test_data.iterrows()])
    
    if display_class_percentages: 
        # Use code of previous versions (ugly).
        train_labeled_data = train_arr
        d_train = {"pos": 0, "neg": 0, "UNK_UNK": 0}
        d_test = d_train.copy()
        for _, label in train_labeled_data:
            d_train[label] += 1
        print("Percentages in train: ")
        print("Negative: %d%% (%d)" %(round(d_train["neg"]*100/len(train_labeled_data)), d_train["neg"]))
        print("Positive:  %d%% (%d)" %(round(d_train["pos"]*100/len(train_labeled_data)), d_train["pos"]))
        print("")

        test_labeled_data = test_arr
        for _, label in test_labeled_data:
            d_test[label] += 1
        print("Percentages in test: ")
        print("Negative: %d%%" %(round(d_test["neg"]*100/len(test_labeled_data))))
        print("Positive:  %d%%" %(round(d_test["pos"]*100/len(test_labeled_data))))
        
    # Create mask arrays.
    unlabeled_masks = np.zeros(unlabeled_arr.shape[0], dtype=bool)
    train_masks = np.ones(train_arr.shape[0], dtype=bool)
    test_masks = np.ones(test_arr.shape[0], dtype=bool)
    
    # Extend the train data with the unlabeled data.
    train_arr = np.append(train_arr, unlabeled_arr, axis=0)
    train_masks = np.concatenate((train_masks, unlabeled_masks))

    
    return train_arr, train_masks, test_arr, test_masks

new_split = True
test_size =              # can be int (number of comments in test set) or float (percentage of train data to use as test)
class_percentages = {"pos": .5, "neg": .5, "same_as_train": False}   # set to True if you want to maintain class percentages
display = True
test_file = "test_data_50-50_neg-pos.pkl"                           # set new_split to False first


train_examples, train_label_masks, test_examples, test_label_masks = train_test_split(fb_comments_labeled_df, 
                                                                          fb_comments_unlabeled_df,
                                                                          test_size,
                                                                          class_percentages=class_percentages,
                                                                          new_split=new_split,
                                                                          display_class_percentages=display,
                                                                          test_file = test_file)