In [11]:
import pandas as pd
import os
from torchtext.legacy.data import Field,TabularDataset,BucketIterator
import torch,random
import torch.optim as optim

def sayhi():
	print("hello")

def load_data(url):
	usecol = ["input","output"]
	if not os.path.exists(url):
		print("File path does not exist")
	else:
		if "csv" in url:
			df = pd.read_csv(url,encoding='utf-8',usecols=usecol)
		else:
			df = pd.read_json(url,encoding='utf-8',lines=True)
			df = df[usecol][:100]
		return df	


import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(nn.Module):
	def __init__(self, vocab, dimension=128):
		super(LSTM, self).__init__()
		self.embedding = nn.Embedding(vocab, 300)
		self.dimension = dimension
		self.lstm = nn.LSTM(input_size=300,
							hidden_size=dimension,
							num_layers=1,
							batch_first=True,
							bidirectional=True)
		self.drop = nn.Dropout(p=0.5)

		self.fc = nn.Linear(2*dimension, 1)

	def forward(self, text, text_len):

		text_emb = self.embedding(text)

		packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
		packed_output, _ = self.lstm(packed_input)
		output, _ = pad_packed_sequence(packed_output, batch_first=True)

		out_forward = output[range(len(output)), text_len - 1, :self.dimension]
		out_reverse = output[:, 0, self.dimension:]
		out_reduced = torch.cat((out_forward, out_reverse), 1)
		text_fea = self.drop(out_reduced)

		text_fea = self.fc(text_fea)
		text_fea = torch.squeeze(text_fea, 1)
		text_out = torch.sigmoid(text_fea)
		return text_out

def get_dataset_torchtext(url,split):
	df = load_data(url)
	STATS = {}
	STATS["num_labels"] = len(df["output"].unique())
	STATS["total_rows"] = len(df)
	del df
	print("hi")
	label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
	text_field = Field(tokenize='spacy',tokenizer_language = 'en_core_web_sm', lower=True, include_lengths=True, batch_first=True)
	
	# fields = [('output', label_field), ('input', text_field)]
	fields = {'input':("input",text_field),
			'output':("output",label_field)}
	# TabularDataset
	train_data = TabularDataset(path=url,format='JSON', fields=fields)#, skip_header=True)
	print("hi2")
	train_data, valid_data = train_data.split(split_ratio=0.7, random_state = random.seed(13))
	STATS["train_rows"] = len(train_data)
	STATS["test_rows"] = len(valid_data)
	return STATS,train_data,text_field,valid_data

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def lstm_train(STATS,train_data,text_field,valid_data):
	train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.input),
								device=device, sort=True, sort_within_batch=True)
	test_iter = BucketIterator(valid_data, batch_size=32, sort_key=lambda x: len(x.input),
								device=device, sort=True, sort_within_batch=True)
	text_field.build_vocab(train_data, min_freq=3)
	print("Hi da",STATS)
	return len(text_field.vocab),train_iter,test_iter

url = r"D:\TSApy\NoCodeMLdash\data\sequence\seq_sarcasm_train.json"
STATS,train_data,text_field,valid_data = get_dataset_torchtext(url,80)
vocab,train_loader,valid_loader = lstm_train(STATS,train_data,text_field,valid_data) 
model = LSTM(vocab).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)


hi
hi2
Hi da {'num_labels': 2, 'total_rows': 100, 'train_rows': 20033, 'test_rows': 8586}


In [13]:
for x in train_data:
    print(x.input,"\n",x.output,type(x.output))
    break

['pete', 'buttigieg', 'stuns', 'campaign', 'crowd', 'by', 'speaking', 'to', 'manufacturing', 'robots', 'in', 'fluent', 'binary'] 
 1 <class 'int'>


In [22]:
for labels, (text, text_len) in train_loader: 
    print(labels,"\n", (text, text_len))

TypeError: cannot unpack non-iterable NoneType object

In [20]:
for x in train_loader:
    vecs,l = x.input
    op = x.output
    print(vecs.shape)
    print(l.shape)
    print(op,op.shape)
    break

torch.Size([32, 2])
torch.Size([32])
tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0.,
        1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0.]) torch.Size([32])


In [2]:
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9
