In [4]:

import numpy as np 
import pandas as pd 
import json

# Read the three competition CSVs: the labelled training rows, the test rows,
# and the frame that later receives the predicted text.
train_df, test_df, sub_df = map(pd.read_csv, (
    '/input/tweet-sentiment-extraction/train.csv',
    '/input/tweet-sentiment-extraction/test.csv',
    '/input/tweet-sentiment-extraction/output.csv',
))

# Plain row arrays, so the converter functions can index columns by position.
train, test = np.array(train_df), np.array(test_df)


In [5]:
# Preview the first rows of the test frame to check its columns.
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [6]:

def find_all(input_str, search_str):
    """Return the start index of every occurrence of ``search_str``.

    Occurrences may overlap: after each hit the scan resumes one character
    further on. Indices are returned in ascending order; the list is empty
    when there is no match or when ``input_str`` is empty.
    """
    positions = []
    total = len(input_str)
    cursor = 0
    while cursor < total:
        hit = input_str.find(search_str, cursor)
        if hit < 0:
            # No further occurrence at or after the cursor.
            break
        positions.append(hit)
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Convert labelled rows into SQuAD-style question-answering records.

    Rows are read by position: ``row[0]`` is the id, ``row[1]`` the text used
    as context, ``row[2]`` the answer span and ``row[-1]`` the label used as
    the question.

    Args:
        train: iterable of indexable rows laid out as described above.

    Returns:
        A list of ``{'context': str, 'qas': [...]}`` dicts, one per usable
        row. Context and answer text are lower-cased, and ``answer_start``
        is an offset into that lower-cased context, so
        ``context[answer_start:answer_start + len(text)] == text`` holds.

    Rows in which context, answer or question is not a ``str`` (for example
    NaN from an empty CSV cell) are printed and skipped. Rows whose answer
    cannot be located in the context are reported and skipped as well,
    instead of being emitted as an answerable question with no answer.
    """
    output = []
    for line in train:
        qid, context, answer, question = line[0], line[1], line[2], line[-1]

        if not (isinstance(answer, str)
                and isinstance(context, str)
                and isinstance(question, str)):
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        context_lc = context.lower()
        answer_lc = answer.lower()

        # Prefer the first position of the answer in the original text, but
        # only if it is still valid after lower-casing: str.lower() can change
        # the length of a few Unicode characters, which shifts later offsets.
        answer_start = context.find(answer)
        if answer_start == -1 or not context_lc.startswith(answer_lc, answer_start):
            answer_start = context_lc.find(answer_lc)

        if answer_start == -1:
            print('answer not found in context, skipping:', qid)
            continue

        answers = [{'answer_start': answer_start, 'text': answer_lc}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context_lc, 'qas': qas})

    return output

# Build the training records and save them as train.json, the file name the
# training call reads later in the notebook.
qa_train = do_qa_train(train)

with open('train.json', 'w') as outfile:
    outfile.write(json.dumps(qa_train))

nan <class 'float'>
nan <class 'float'>
neutral <class 'str'>


In [7]:


def do_qa_test(test):
    """Convert unlabelled rows into SQuAD-style records for prediction.

    Rows are read by position: ``row[0]`` is the id, ``row[1]`` the text used
    as context and ``row[-1]`` the label used as the question.

    Args:
        test: iterable of indexable rows laid out as described above.

    Returns:
        A list of ``{'context': str, 'qas': [...]}`` dicts. The context is
        lower-cased and every question carries one placeholder answer
        (``'__None__'`` at offset 1000000), because no real answer is known.

    Rows whose context or question is not a ``str`` (for example NaN from an
    empty CSV cell) are printed and skipped, so the result can be shorter
    than the input; match results back to rows by ``id``, not by position.
    """
    output = []
    for line in test:
        qid, context, question = line[0], line[1], line[-1]

        if not (isinstance(context, str) and isinstance(question, str)):
            # Test rows have no answer column, so only these two are reported.
            print(context, type(context))
            print(question, type(question))
            continue

        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the prediction records and keep a JSON copy on disk as test.json.
qa_test = do_qa_test(test)

with open('test.json', 'w') as outfile:
    outfile.write(json.dumps(qa_test))

In [11]:
from simpletransformers.question_answering import QuestionAnsweringModel

# Directory the pretrained weights are loaded from. The path name suggests a
# DistilBERT checkpoint distilled on SQuAD; that is not verifiable from this cell.
MODEL_PATH = '/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'

# Build the question-answering wrapper with model type 'distilbert', passing a
# dict of training arguments (3 epochs, learning rate 5e-5, max sequence
# length 192, doc stride 64, fp16 off).
# NOTE(review): use_cuda=True presumably requires a CUDA-capable GPU — confirm.
model = QuestionAnsweringModel('distilbert', 
                               MODEL_PATH, 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 192,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=True)

# Train on train.json, the file written earlier from the do_qa_train records.
model.train_model('train.json')

100%|██████████| 27480/27480 [00:56<00:00, 486.24it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 4.320715



Running loss: 0.685129


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 0.385776


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 1.186762



In [12]:

# Run inference on the in-memory test records and load the results into a frame.
predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)

# Attach answers by question id rather than by row position. sub_df is read
# from a different file than the test rows, and do_qa_test may drop rows with
# missing text, so row positions are not guaranteed to line up.
# NOTE(review): assumes every prediction dict carries the 'id' that was put
# into its question record — confirm against the installed simpletransformers.
sub_df['selected_text'] = sub_df['textID'].map(predictions_df.set_index('id')['answer'])

sub_df.to_csv('final_output.csv', index=False)


100%|██████████| 3534/3534 [00:06<00:00, 543.38it/s]


HBox(children=(FloatProgress(value=0.0, max=442.0), HTML(value='')))




In [13]:
# Preview the submission frame after the predicted text has been attached.
sub_df.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy
4,33987a8ee5,i like it!!
...,...,...
3529,e5f0e6ef4b,tired
3530,416863ce47,thanks
3531,6332da480c,sinking
3532,df1baec676,i love
