## Required Libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from nltk.corpus import stopwords
import string
from unidecode import unidecode
import random
import itertools
import csv

## Import Data

In [3]:
PATH = '../data/'

content = pd.read_csv(f"{PATH}content_filtered.csv")
correlations = pd.read_csv(f"{PATH}correlations.csv")
#sample_submission = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")
topics = pd.read_csv(f"{PATH}topics_filtered.csv")

## Data Preparation

In [4]:
def combine(correlations, topics, content):
    '''
    - Inputs our three datasets and combines the topic/content information with the topic/content correlations data. 
    - All topic/content information is concatenated to one "features" column, which includes the language, title, description, etc.
    - Output includes the correlations topics information, correlations content information, and a dictionary to convert indices to their
      corresponding topic/content id. 
    '''
    #Drop/combine columns
    content["text"] = content["text"].fillna('')
    content = content.dropna()
    content_combined = content["language"] + " " + content["title"] + " " + content["description"] + " " + content["text"]
    content_combined = pd.DataFrame({"id":content["id"], "features":content_combined})
    print("content_combined", content_combined.shape)

    topics["description"] = topics["description"].fillna('')
    topics = topics.dropna()
    topics_combined = topics["language"] + " " + topics["channel"] + ' ' + topics["title"] + " " + topics["description"]
    topics_combined = pd.DataFrame({"id":topics["id"], "features":topics_combined})
    print("topics_combined", topics_combined.shape)
    
    #Explode correlations rows
    correlations["content_ids"] = correlations["content_ids"].str.split()
    correlations = correlations.explode("content_ids")

    #Merge
    merged = correlations.merge(topics_combined, how="inner", left_on="topic_id", right_on="id")
    print("merged", merged.shape)
    merged = merged.reset_index().merge(content_combined, how="inner", left_on="content_ids", right_on="id", sort=False, suffixes=("_topics", "_content")).sort_values(axis=0, by="index")
    merged = merged.drop(["content_ids", "topic_id"], axis=1)
    print("merged", merged.shape)

    #Split
    corr_topics = merged[['index', 'features_topics']]
    corr_topics.columns = ['id', 'features']
    corr_content = merged[['index', 'features_content']]
    corr_content.columns = ['id', 'features']

    index_to_topic = pd.Series(merged.id_topics.values, index=merged.index).to_dict()
    index_to_content = pd.Series(merged.id_content.values, index=merged.index).to_dict()

    return corr_topics, corr_content, index_to_topic, index_to_content

#### Apply combine() to our data
corr_topics, corr_content, index_to_topic, index_to_content = combine(correlations, topics, content)



content_combined (16906, 2)
topics_combined (36078, 2)
merged (127725, 4)
merged (47435, 5)


In [9]:
lang_dict = {"en":"english"}

# List of languages supported by the natural language tool kit (NLTK) module.
supported_languages = stopwords.fileids()

def remove_stopwords(text):
    '''
    Checks language of text then removes stopwords from that language if supported.
    '''
    lang_code = text[0:2]
    if lang_dict[lang_code] in supported_languages:
        for word in stopwords.words(lang_dict[lang_code]):
            text = text.replace(' ' + word + ' ', ' ')
    return text

In [10]:
corr_topics["features"] = corr_topics.features.apply(remove_stopwords)
corr_content["features"] = corr_content.features.apply(remove_stopwords)

## Create Training and Testing sets

In [11]:
# Training
random.seed(10)
train_indices = random.sample(range(len(corr_content)), round(0.8*len(corr_content))) #80/20 train/test split

half = round(len(train_indices) / 2)
full = len(train_indices)

train_topics_half = corr_topics.iloc[train_indices[:half], :]
train_content_half = corr_content.iloc[train_indices[:half], :]

train_topics_full = corr_topics.iloc[train_indices[half:(full-20)], :] 
train_content_full = corr_content.iloc[train_indices[(half+20):(full)], :] 

train_topics = pd.concat([train_topics_half, train_topics_full]).reset_index().drop("index", axis=1)
train_content = pd.concat([train_content_half, train_content_full]).reset_index().drop("index", axis=1)

# Testing
test_topics = corr_topics.drop(train_indices, axis=0)
test_content = corr_content.drop(train_indices, axis=0)

half = round(len(test_topics.features) / 2)
full = len(test_topics.features)

test_topics_half = test_topics.iloc[:half, :]
test_content_half = test_content.iloc[:half, :]

test_topics_full = test_topics.iloc[half:(full - 5), :]
test_content_full = test_content.iloc[(half+5):full, :]

test_topics = pd.concat([test_topics_half, test_topics_full]).reset_index().drop("index", axis=1)
test_content = pd.concat([test_content_half, test_content_full]).reset_index().drop("index", axis=1)

# Create Train Labels
train_labels = np.array((train_topics.id == train_content.id).astype(int))
test_labels = np.array((test_topics.id == test_content.id).astype(int))

## Conversion to Tensors

In [14]:
train_topics = torch.utils.data.TensorDataset(torch.as_tensor(train_topics.features, dtype=torch.string))
train_topics = torch.utils.data.Dataset.from_tensor_slices(torch.as_tensor(train_content.features, dtype=torch.string))
train_topics = torch.utils.data.Dataset.from_tensor_slices(torch.as_tensor(train_labels.features, dtype=torch.string))
# train_topics = tf.data.Dataset.from_tensor_slices(tf.cast(train_topics.features, tf.string))
# train_content = tf.data.Dataset.from_tensor_slices(tf.cast(train_content.features, tf.string))
# train_labels = tf.data.Dataset.from_tensor_slices(tf.cast(train_labels, tf.int32))

test_topics = torch.utils.data.Dataset.from_tensor_slices(torch.as_tensor(test_topics.features, dtype=torch.string))
test_topics = torch.utils.data.Dataset.from_tensor_slices(torch.as_tensor(test_content.features, dtype=torch.string))
test_topics = torch.utils.data.Dataset.from_tensor_slices(torch.as_tensor(test_labels.features, dtype=torch.string))
# test_topics = tf.data.Dataset.from_tensor_slices(tf.cast(test_topics.features, tf.string))
# test_content = tf.data.Dataset.from_tensor_slices(tf.cast(test_content.features, tf.string))
# test_labels = tf.data.Dataset.from_tensor_slices(tf.cast(test_labels, tf.int32))

AttributeError: type object 'Dataset' has no attribute 'from_tensor_slices'