In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

In [25]:
# Load the pre-processed train / validation / test splits.
# NOTE(review): the validation file is spelled 'var_processed.csv' -- confirm
# this is the intended file name and not a typo for 'val_processed.csv'.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '../Data/train_processed.csv',
        '../Data/var_processed.csv',
        '../Data/test_processed.csv',
    ),
)

In [26]:
# Preview the first five rows of the training split (shown as the cell output).
train_df.head(n=5)

Unnamed: 0,asin,category,questionText,questionType,review_snippets,answers,is_answerable,qid
0,B000MP20BU,Toys_and_Games,"Many have stated similar to the following: ""Pa...",descriptive,"[""A lot of reviewers have said things about th...","[{'answerText': ""The paint has held up through...",1.0,0.0
1,B00BOXZZU2,Health_and_Personal_Care,Will these work with the Phillips sonicare han...,descriptive,"[""I didn't even realize such a small electric ...","[{'answerText': 'The answer unfortunately, is ...",0.0,1.0
2,B00CSYD4M2,Cell_Phones_and_Accessories,What kind of sim card it use?,descriptive,['I bought this phone a few weeks ago.I am usi...,[{'answerText': 'This phone is an unlocked GSM...,1.0,2.0
3,B00C5TNSRG,Home_and_Kitchen,does anyone know if this dinnerware set does n...,descriptive,"[""I love my new dishes! They are so versatile....",[{'answerText': 'According to the internet sea...,0.0,3.0
4,B0099XQBD4,Musical_Instruments,I'm thinking of getting in to modular synthesi...,descriptive,['Will order another in the near future and ar...,"[{'answerText': 'Yes it will.', 'answerType': ...",0.0,4.0


In [27]:
# Align the validation split with the training schema: drop, in place, every
# column whose label does not appear among the training columns.
val_df.drop(columns=val_df.columns[~val_df.columns.isin(train_df.columns)], inplace=True)

In [28]:
# Align the test split with the training schema: drop, in place, every column
# whose label does not appear among the training columns.
test_df.drop(columns=test_df.columns[~test_df.columns.isin(train_df.columns)], inplace=True)

In [29]:
# Remove, in place, the training rows whose `is_answerable` label is missing.
train_df.dropna(axis='index', how='any', subset=['is_answerable'], inplace=True)

In [30]:
# Apply the same missing-label filter, in place, to the test and validation splits.
for split_df in (test_df, val_df):
    split_df.dropna(subset=['is_answerable'], inplace=True)

In [12]:
def convert_tolst(df):
    """Return a copy of ``df`` whose text-list columns are split into word arrays.

    Every cell of ``review_snippets`` and ``answers`` must be a string. Square
    brackets, single quotes and newline characters are removed from it, the
    remainder is split on single spaces, and the pieces are stored as a NumPy
    array. The frame passed in is not modified.
    """
    # One translation table deletes the same four characters in a single pass.
    strip_table = str.maketrans('', '', "[]\n'")

    def to_word_array(cell):
        return np.array(cell.translate(strip_table).split(' '))

    converted = df.copy()
    for column in ('review_snippets', 'answers'):
        converted[column] = converted[column].apply(to_word_array)
    return converted

# Convert every split and rebind each name to its converted copy.
# NOTE(review): this cell is not idempotent -- convert_tolst calls string
# methods on each cell, so re-running it on already-converted frames will fail.
train_df, val_df, test_df = map(convert_tolst, (train_df, val_df, test_df))

In [31]:
# Write each cleaned split to its own CSV, then one file with the three splits
# stacked in train, validation, test order. The DataFrame index is not written.
for split_df, csv_path in (
    (train_df, '../Data/train.csv'),
    (val_df, '../Data/val.csv'),
    (test_df, '../Data/test.csv'),
):
    split_df.to_csv(csv_path, index=False)
pd.concat([train_df, val_df, test_df]).to_csv('../Data/all.csv', index=False)

In [18]:
# Modelling frame: the two text inputs plus the target label, copied so that
# train_df itself is not modified.
model_data = train_df.loc[:, ['review_snippets', 'questionText', 'is_answerable']].copy()
# Join each row's sequence of snippet pieces back into one string, using
# four spaces as the separator.
model_data['review_snippets'] = [
    '    '.join(pieces) for pieces in model_data['review_snippets']
]
# Class balance of the target (shown as the cell output).
model_data['is_answerable'].value_counts()

1.0    450484
0.0    279584
Name: is_answerable, dtype: int64

In [19]:
# Separate the target from the inputs: y is the label array, X keeps the
# remaining (text) columns.
y = model_data['is_answerable'].to_numpy()
X = model_data.drop(columns='is_answerable')

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from nltk import tokenize
import pickle
import transformers
import torch

In [21]:
# Tokenizer loaded from the "bert-base-uncased" checkpoint; add_features uses
# it to turn text into token ids.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word
# list; it is fitted in a later cell.
tfidf_vectorizer = TfidfVectorizer(stop_words ='english')
# Sentence-embedding model; add_features uses it to encode questions, reviews
# and individual review sentences.
sentence_t = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [22]:
# Fit the TF-IDF vectorizer on every question followed by every review text,
# so that both kinds of text share a single vocabulary.
tfidf_vectorizer.fit([*X['questionText'], *X['review_snippets']])

In [23]:
def add_features(df):
  """Return a copy of ``df`` extended with question/review similarity features.

  ``df`` must provide ``questionText`` and ``review_snippets`` columns. Both are
  passed to the tokenizer, the TF-IDF vectorizer and the sentence encoder, so
  they are assumed to hold plain strings (TODO confirm against the caller).
  The function relies on the notebook-level ``tokenizer``, ``tfidf_vectorizer``
  (already fitted), ``sentence_t`` and nltk's ``tokenize`` module.

  Columns added (``*`` is ``encoded`` for sentence embeddings or ``tfidf``):
    question_ntokens, review_ntokens -- length of the tokenizer's ``input_ids``
      (capped at 512 by truncation).
    intersec -- number of question token ids that also occur in the review.
    intersec_pct -- ``intersec`` divided by ``question_ntokens``.
    cosine_sim_*, dot_prod_*, euclid_dist_* -- question-vs-review scores.
    sent_max_*, sent_mean_* -- max / mean dot product between the question
      vector and the vector of each review sentence (NaN when the review
      yields no sentences).

  The input frame is not modified.
  """
  #HELPERS
  def intersec(q,r):
    # Count the items of q that also occur in r; a set keeps each lookup O(1).
    seen = set(r)
    return sum(1 for x in q if x in seen)

  def vectorize(text):
    # NOTE(review): this densifies one vocabulary-sized vector per call and the
    # vectors are kept in temporary columns below; on a large frame this can be
    # very memory hungry -- consider batching / sparse math if that matters.
    return tfidf_vectorizer.transform([text]).toarray()[0]

  def token_ids(text):
    # Deliberately NOT named `tokenize`: a local helper with that name shadowed
    # nltk's `tokenize` module, which the sentence splitter below needs.
    return tokenizer(text,max_length=512,truncation=True).get('input_ids')

  def sentenceTransform(text):
    return sentence_t.encode(text)

  def cosine_similarity(a,b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
      # A zero vector (e.g. a text with no in-vocabulary TF-IDF terms) has no
      # direction; report 0.0 instead of dividing by zero and producing NaN.
      return 0.0
    return np.dot(a,b)/denom

  def dotproduct(a,b):
    return np.dot(a,b)

  def euclid_dist(a,b):
    return np.linalg.norm(a-b)

  def sentence_max_mean(model,q,r):
    # Score the question vector q against every sentence of the review text r.
    # Returned in (max, mean) order to match the column lists assigned below.
    vals = [dotproduct(q,model(sentence)) for sentence in tokenize.sent_tokenize(r)]
    if not vals:
      # No sentences found: np.max would raise on an empty list.
      return pd.Series([np.nan, np.nan])
    return pd.Series([np.max(vals), np.mean(vals)])

  df = df.copy()
  # Tokenize each text once and derive the counts from the stored ids.
  df['question_tokens'] = df['questionText'].apply(token_ids)
  df['review_tokens'] = df['review_snippets'].apply(token_ids)
  df['question_ntokens'] = df['question_tokens'].apply(len)
  df['review_ntokens'] = df['review_tokens'].apply(len)
  # Overlap is measured on token ids so that it is consistent with the
  # token-count denominator of intersec_pct (comparing the raw strings would
  # count matching characters instead).
  # NOTE(review): the ids presumably include the tokenizer's special tokens,
  # which then always match -- confirm whether they should be excluded.
  df['intersec'] = df.apply(lambda row: intersec(row['question_tokens'], row['review_tokens']),axis=1)
  df['intersec_pct'] = df['intersec'] / df['question_ntokens']
  df['question_encoded'] = df['questionText'].apply(sentenceTransform)
  df['review_encoded'] = df['review_snippets'].apply(sentenceTransform)
  df['question_tfidf'] = df['questionText'].apply(vectorize)
  df['review_tfidf'] = df['review_snippets'].apply(vectorize)

  for m in ['encoded','tfidf']:
    df[f'cosine_sim_{m}'] = df.apply(lambda row: cosine_similarity(row[f'question_{m}'], row[f'review_{m}']),axis =1 )
    df[f'dot_prod_{m}'] = df.apply(lambda row: dotproduct(row[f'question_{m}'], row[f'review_{m}']),axis =1 )
    df[f'euclid_dist_{m}'] = df.apply(lambda row: euclid_dist(row[f'question_{m}'], row[f'review_{m}']),axis =1 )
  # The helper returns (max, mean), so each value lands in the column that
  # carries its name.
  df[['sent_max_encoded','sent_mean_encoded']] = df.apply(lambda row: sentence_max_mean(sentenceTransform, row['question_encoded'], row['review_snippets']),axis =1 )
  df[['sent_max_tfidf','sent_mean_tfidf']] = df.apply(lambda row: sentence_max_mean(vectorize, row['question_tfidf'], row['review_snippets']),axis =1 )

  # Drop the intermediate token / vector columns; only scalar features remain.
  return df.drop(columns=['question_tokens','review_tokens','question_encoded','review_encoded','question_tfidf','review_tfidf'])
