## Feature Extraction from the UIUC and DISEQuA Collections

In [1]:
import pandas as pd
import spacy


# Load one spaCy pipeline per model name, in a single pass.
# The order of the names on the left must match the order of the model
# names below.
nlp_en, nlp_es, nlp_pt, nlp_it, nlp_nl = [
    spacy.load(model_name)
    for model_name in (
        "en_core_web_sm",
        "es_core_news_sm",
        "pt_core_news_sm",
        "it_core_news_sm",
        "nl_core_news_sm",
    )
]

### DISEQuA

In [2]:
# Read the DISEQuA question collection; the cells below use its
# `question` and `language` columns.
df = pd.read_csv(filepath_or_buffer='datasets/DISEQuA/disequa.csv')

#### POS Tags and Entity Types

In [4]:
# Map each DISEQuA language code to the spaCy pipeline used to parse it.
pipelines = {
    "DUT": nlp_nl,
    "ENG": nlp_en,
    "ITA": nlp_it,
    "SPA": nlp_es,
}

# Check the language codes before parsing anything. Previously an unknown
# code only printed a message and broke out of the loop, which left `pos`
# and `ent` shorter than the DataFrame, so the column assignment below
# failed with an unrelated length-mismatch error after the earlier rows had
# already been parsed.
unsupported = [code for code in df.language.unique() if code not in pipelines]
if unsupported:
    raise ValueError("Language not found: {}".format(unsupported))

# One list per question, holding one value per token of the parsed question.
pos = []
ent = []

for question, language in zip(df.question, df.language):
    q = pipelines[language](question)
    pos.append([w.pos_ for w in q])
    ent.append([w.ent_type_ for w in q])

df['pos'] = pos
df['ent'] = ent

#### Save

In [None]:
# Persist the DISEQuA frame, including the `pos` and `ent` columns added above.
df.to_csv(path_or_buf='datasets/DISEQuA/disequa_features.csv')

### UIUC

#### POS Tags and Entity Types

In [18]:
# For every UIUC language variant, add per-token `pos` and `ent` columns to
# the train and test splits and write each split to its *_features.csv file.
for lang in ['en', 'es', 'pt']:

    # Both splits are read up front, so a missing input file is reported
    # before any output file is written.
    df_train = pd.read_csv('datasets/UIUC_'+lang+'/train.csv')
    df_test = pd.read_csv('datasets/UIUC_'+lang+'/test.csv')

    # Pick the pipeline for this language; anything that is not 'en' or
    # 'es' falls back to the Portuguese pipeline.
    nlp = {'en': nlp_en, 'es': nlp_es}.get(lang, nlp_pt)

    # Train is processed and saved first, then test.
    splits = (
        (df_train, 'datasets/UIUC_'+lang+'/train_features.csv'),
        (df_test, 'datasets/UIUC_'+lang+'/test_features.csv'),
    )
    for frame, out_path in splits:
        # Parse each question once and keep both attributes of every token.
        tagged = [
            [(w.pos_, w.ent_type_) for w in nlp(question)]
            for question in frame.question
        ]
        pos = [[tag for tag, _ in row] for row in tagged]
        ent = [[label for _, label in row] for row in tagged]

        frame['pos'] = pos
        frame['ent'] = ent
        frame.to_csv(out_path)