<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import joblib




In [2]:
# avoid decoding problems
df = pd.read_csv("train.csv")

# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [3]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
device

'cuda'

In [6]:
def get_weighted_embedding(texts, word2tfidf, model, batch_size):
    if isinstance(texts, str):
        texts = [texts]
        
    all_embeddings = []
    texts_len = len(texts)
    for i in range(0, texts_len, batch_size):
        # print(f'Processing batch: {i // batch_size}')
        batch = texts[i:i+batch_size]
        # Compute average TF-IDF weight per text
        avg_weights = []
        for text in batch:
            words = text.split()
            if words:
                weights = [word2tfidf.get(w, 0) for w in words]
                avg_weights.append(sum(weights) / len(weights))
            else:
                avg_weights.append(0.0)
        avg_weights = torch.tensor(avg_weights, device=device)
        
        # Encode batch to sentence embeddings
        with torch.no_grad():
            embeddings = model.encode(batch, convert_to_tensor=True, device=device)

        # Scale embeddings by average weights
        weighted_embeddings = embeddings * avg_weights.unsqueeze(1)
        all_embeddings.extend(weighted_embeddings.detach().cpu().numpy())
        try:
            del batch, embeddings, weighted_embeddings
        except NameError:
            pass

    # print(np.array(all_embeddings).shape, type(all_embeddings), type(all_embeddings[0]))
    return all_embeddings


In [7]:
if os.path.isfile('nlp_vector_dataframe.joblib'):
    print('Loading vector dataframe from the file.')
    df = joblib.load('nlp_vector_dataframe.joblib')
else:
    df['qu1_feats_m'] = get_weighted_embedding(
        df['question1'].values, word2tfidf, model, batch_size=2**13)
    df['qu2_feats_m'] = get_weighted_embedding(
        df['question2'].values, word2tfidf, model, batch_size=2**13)
    df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1,inplace=True)
    joblib.dump(df, 'nlp_vector_dataframe.joblib')

In [8]:
print(df['qu1_feats_m'].shape)
print(df['qu2_feats_m'].shape)

(404290,)
(404290,)


In [9]:
print(df['qu1_feats_m'][0].shape)
print(df['qu2_feats_m'][0].shape)

(384,)
(384,)


In [10]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")

In [11]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df
df3_q1 = pd.DataFrame(df3['qu1_feats_m'].values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3['qu2_feats_m'].values.tolist(), index= df3.index)

In [12]:
# dataframe of nlp features
df1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,63,63,43,47,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,28,24,9,14,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,35,56,0.175


In [13]:
# data before preprocessing 
df2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.0,2,0
4,4,3,1,76,39,13,7,2.0,20.0,0.1,4,2


In [14]:
# Questions 1 tfidf weighted word2vec
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.320846,-0.186737,-0.287031,0.03515,-0.276487,0.257875,0.084914,0.116474,-0.499918,-0.104155,...,-0.235298,0.379984,-0.405718,0.040615,-0.133902,-0.235007,0.003497,-0.197978,-0.309128,0.177339
1,-0.162865,0.539825,-0.136423,0.169766,-0.18727,-0.038557,-0.115679,-0.172218,-0.127867,-0.080888,...,-0.11957,-0.029279,-0.072696,-0.007461,-0.205621,-0.120081,0.282377,-0.115446,0.008669,0.139131
2,-0.111571,0.146567,-0.001577,0.070013,-0.302598,0.012227,0.164734,0.088041,0.068868,-0.18042,...,-0.104089,-0.298222,0.218934,-0.090424,-0.325695,0.448379,-0.33801,-0.263089,0.01635,-0.065452
3,0.214366,-0.191116,0.155799,0.366758,0.126644,0.079277,0.335356,0.119631,0.248407,-0.112345,...,0.012611,0.189038,0.192116,0.403073,0.172075,0.039224,0.208773,0.104064,-0.268618,-0.315431
4,-0.28526,-0.149384,-0.211819,-0.013051,0.452586,-0.248367,0.280504,0.349548,0.008555,-0.129971,...,0.134815,0.179587,-0.070858,-0.199728,-0.317968,-0.167555,0.561263,-0.228302,0.663638,-0.302125


In [15]:
# Questions 2 tfidf weighted word2vec
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.423212,-0.164554,-0.270203,0.10472,-0.216234,0.280429,0.047632,0.067348,-0.476417,0.013434,...,-0.171368,0.291435,-0.217767,0.099255,-0.071048,-0.17797,-0.037065,-0.205337,-0.267059,0.197239
1,-0.391221,0.747902,-0.065154,-0.177288,-0.066026,-0.031845,-0.18504,-0.287023,-0.160371,0.101577,...,0.243118,0.172128,0.005463,0.377117,-0.371125,-0.119946,0.246193,-0.60637,-0.110116,-0.052228
2,-0.022114,-0.307914,0.281715,-0.186233,-0.593995,-0.194735,-0.066883,-0.252384,0.367712,-0.024775,...,-0.222976,0.040296,-0.012184,-0.021796,-0.325127,0.352813,-0.202566,-0.261915,0.147404,0.025606
3,0.234764,0.252844,-0.162482,0.058531,-0.048424,-0.179547,0.136423,0.190352,0.121681,0.023969,...,0.130749,-0.102055,-0.085348,-0.113203,0.071612,0.277859,-0.029385,-0.032677,0.244178,-0.313774
4,-0.305681,0.254926,-0.127245,-0.149993,-0.017058,-0.037423,0.091251,-0.127722,-0.181144,-0.279247,...,-0.24317,0.2318,0.143391,-0.209443,-0.718074,0.253778,-0.082368,0.072807,-0.012569,0.523109


- 11 features + id = 12 basic features
- 15 adv. features + id + is_duplicate = 17 adv. features

In [17]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 384
Number of features in question2 w2v  dataframe : 384
Number of features in final dataframe  : 797


In [None]:
df3_q1['id']=df1['id']
df3_q2['id']=df1['id']
df1  = df1.merge(df2, on='id',how='left')
df2  = df3_q1.merge(df3_q2, on='id',how='left')
result  = df1.merge(df2, on='id',how='left')
if not os.path.isfile('final_features.joblib'):
    joblib.dump(result, 'final_features_dataframe.joblib')

In [19]:
result.shape
# id is same for all dfs, so the final shape has 1 column less
# than the sum of individual shapes after the merge

(404290, 796)

In [20]:
result.columns

Index(['id', 'is_duplicate', 'cwc_min', 'cwc_max', 'csc_min', 'csc_max',
       'ctc_min', 'ctc_max', 'last_word_eq', 'first_word_eq',
       ...
       '374_y', '375_y', '376_y', '377_y', '378_y', '379_y', '380_y', '381_y',
       '382_y', '383_y'],
      dtype='object', length=796)

In [22]:
len(result.columns)

796