In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Conversation, pipeline, TrainingArguments, Trainer, Seq2SeqTrainer
from datasets import load_metric
import os

In [2]:
df = pd.read_csv('data/intent_Tweets.csv')

In [3]:
neg_tweets = df[df['airline_sentiment'] == -1]

In [4]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/GODEL-v1_1-large-seq2seq')
model = AutoModelForSeq2SeqLM.from_pretrained('microsoft/GODEL-v1_1-large-seq2seq')


Downloading:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.75G [00:00<?, ?B/s]

In [14]:
def respond_to_tweet(text, tk, m, i, k):
    if k != '':
        k = '[KNOWLEDGE] ' + k
    if text != '':
        text = f'[CONTEXT] {text}'
    query = f'{i} {text} {k}'
    encoding = tk.encode(f'{query}', return_tensors='pt')
    outputs = m.generate(encoding, max_length=140, min_length=8, top_p=.9, do_sample=True)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output

In [6]:
# This is the playground to test the base model
instruction = 'Instruction: As an airline help representative, help the person book a new flight'
tweet = f'My flight was cancelled from Houston to New York and I need help getting home'
knowledge = 'there is an available flight at 8pm'
respond_to_tweet(tweet, tokenizer, model, instruction, knowledge)

'You can help you book a new flight. Do this as a passenger to ensure a safe flight home.'

In [12]:
def intent_to_int(intent):
    intents = [
        'Bad Flight',
        'Can\'t Tell',
        'Late Flight',
        'Customer Service Issue',
        'Flight Booking Problem',
        'Lost Luggage',
        'Flight Attendant Complaint',
        'Cancelled Flight',
        'Damaged Luggage'
    ]
    try:
        idx = intents.index(intent)
        return idx
    except:
        return f'Value not found: {intent} is not a valid intent'
        

In [8]:
posttrain_tweets = neg_tweets[neg_tweets['response_label'].notnull()]
posttrain_tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,response_label
0,0,570301031407624196,-1,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,virginamerica it is really aggressive to blas...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),Sorry! We will try to ensure the audio isn’t t...
4,4,570276917301137409,-1,1.0,Bad Flight,1.0,Virgin America,,heatherovieda,,0,virginamerica i flew from nyc to sfo last wee...,,2015-02-24 09:39:46 -0800,this place called NYC,Eastern Time (US & Canada),That’s the luck of the draw I guess.
5,5,570265883513384960,-1,0.6705,Can't Tell,0.3614,Virgin America,,MISSGJ,,0,virginamerica why are your first fares in may...,,2015-02-24 08:55:56 -0800,,,Sorry it’s so expensive right now! Gas prices ...
6,6,570256553502068736,-1,1.0,Customer Service Issue,0.3557,Virgin America,,ayeevickiee,,0,virginamerica you guys messed up my seating i...,,2015-02-24 08:18:51 -0800,714,Mountain Time (US & Canada),So sorry about that! We’ll be sure to make sur...
7,7,570249102404923392,-1,1.0,Customer Service Issue,1.0,Virgin America,,Leora13,,0,virginamerica status match program i applied ...,,2015-02-24 07:49:15 -0800,,,"You should be hearing back soon, thank you for..."


In [9]:
def tokenize_posttrain(posttrain, field):
    return posttrain[field].map(lambda t: tokenizer(t, padding='max_length', truncation=True, return_tensors='pt'))

In [6]:
def create_tweet_input(context, text):
    text = f"[KNOWLEDGE] {text}"
    i = intent_to_int(context)
    if i == 0:
        return f'Instruction: Ask the person how you can help in the future {text} [CONTEXT] {context}'
    elif i == 1:
        return f'Instruction: Ask the person how the airline can do better {text}'
    elif i == 2:
        return f'Instruction: Thank the person for their patience, see if they want to switch flights {text} [CONTEXT] {context}'
    elif i == 3:
        return f'Instruction: See if there is anything you can do to help {text} [CONTEXT] {context}'
    elif i == 4:
        return f'Instruction: Offer to help with booking the flight {text} [CONTEXT] {context}'
    elif i == 5:
        return f'Instruction: Ask the customer where their luggage is coming from and for the receipt number {text} [CONTEXT] {context}'
    elif i == 6:
        return f'Instruction: Ask the customer what flight they were on {text} [CONTEXT] {context}'
    elif i == 7:
        return f'Instruction: Offer to book another flight for the customer {text} [CONTEXT] {context}'
    elif i == 8:
        return f'Instruction: Ask the customer where their luggage is coming from and for the receipt number {text} [CONTEXT] {context}'
    else:
        return f'Instruction: Ask if there is anything you can do'
    

In [11]:
posttrain_tweets['embeddings'] = posttrain_tweets.apply(lambda x: create_tweet_input(x['negativereason'], x['text']), axis=1)
# print(tokenize_posttrain(posttrain_tweets, 'embeddings')[0])
# print(posttrain_tweets['embeddings'].map(lambda text: tokenizer.encode(text)))
data = { 'text': tokenize_posttrain(posttrain_tweets, 'embeddings'),
         'labels': tokenize_posttrain(posttrain_tweets, 'response_label')
       }
t_posttrain = pd.DataFrame(data=data)
t_posttrain.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,labels
0,"[input_ids, attention_mask]","[input_ids, attention_mask]"
4,"[input_ids, attention_mask]","[input_ids, attention_mask]"
5,"[input_ids, attention_mask]","[input_ids, attention_mask]"
6,"[input_ids, attention_mask]","[input_ids, attention_mask]"
7,"[input_ids, attention_mask]","[input_ids, attention_mask]"


In [12]:
X = t_posttrain['text']
y = t_posttrain['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7)

[21035,
 10,
 8366,
 8,
 568,
 149,
 25,
 54,
 199,
 16,
 8,
 647,
 784,
 439,
 12038,
 17717,
 5042,
 908,
 24556,
 23064,
 34,
 19,
 310,
 8299,
 12,
 11925,
 3,
 32,
 115,
 19864,
 2936,
 4527,
 16,
 39,
 2554,
 8519,
 11483,
 79,
 43,
 385,
 22975,
 15,
 784,
 17752,
 3463,
 4,
 382,
 908,
 3862,
 16736,
 1]

In [13]:
training_args = TrainingArguments("data", per_device_train_batch_size=1)

In [14]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        
        self.x = x.values
        self.y = y.values
    
    def __len__ (self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        item = { 
            'input_ids': x['input_ids'].squeeze(0),
            'attention_mask': x['attention_mask'].squeeze(0),
            'labels': y['input_ids'].squeeze(0),
        }
        return item

In [15]:
#The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,decoder_input_ids,decoder_attention_mask,head_mask,decoder_head_mask,cross_attn_head_mask,encoder_outputs,past_key_values,inputs_embeds,decoder_inputs_embeds,labels,use_cache,output_attentions,output_hidden_states,return_dict,labels,label_ids,label.

train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

In [16]:
trainer = Seq2SeqTrainer(
       model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset
)

In [17]:
torch.cuda.empty_cache()
# Some issues here w/ CUDA, this will for future use 
trainer.train()

***** Running training *****
  Num examples = 25
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 75
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyError: 'loss'

In [15]:
def response_map(row):
    context = row['negativereason']
    # Were this a real pipeline, context would likely have information related to flights, luggage tracking, etc.
    text = row['text']
    tweet_input = create_tweet_input(context, text)
    response = respond_to_tweet('', tokenizer, model, tweet_input, '')
    return response
    

In [None]:
neg_tweets.loc[0:1000, 'responses'] = neg_tweets.loc[0:1000].apply(response_map, axis=1)

In [23]:
neg_tweets.loc[0:1000, 'responses'] 

0    Do you have any recommendations on what else t...
1                   How are they doing this right now?
Name: responses, dtype: object