# Enrich test and training datasets

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing
from keyword_transformer import KeywordTransformer
from location_transformer import LocationTransformer
from text_stats_transformer import TextStatsTransformer


In [3]:
# Load the training and test splits from CSV; the `id` column becomes the
# row index of each frame rather than a regular data column.
csv_read_options = {'index_col': 'id'}
df_train = pd.read_csv('./train.csv', **csv_read_options)
df_test = pd.read_csv('./test.csv', **csv_read_options)


In [4]:
# Each enrichment step is a (name, transformer, input columns) triple.
keyword_step = (
    'keywords',
    KeywordTransformer(keyword_file_name='./keywords_stats.csv'),
    ['keyword'],
)
location_step = ('location', LocationTransformer(), ['location'])
text_stats_step = ('text-features', TextStatsTransformer(), ['text'])

# Combine the steps into one column-wise transformer.
transformer = compose.ColumnTransformer(
    transformers=[keyword_step, location_step, text_stats_step],
    remainder='passthrough',          # columns not named above are kept as-is
    verbose_feature_names_out=False,  # do not prefix output names with the step name
    verbose=True,                     # log per-step progress and elapsed time
)

In [5]:
# Fit every step of the column transformer on the training frame and return
# the enriched feature matrix in a single pass. With verbose=True the
# transformer prints one progress line per step (including the passthrough
# remainder) while this runs.
train_enriched = transformer.fit_transform(df_train)

[ColumnTransformer] ...... (1 of 4) Processing keywords, total=   0.0s
[ColumnTransformer] ...... (2 of 4) Processing location, total=  41.1s
[ColumnTransformer] . (3 of 4) Processing text-features, total=   9.4s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s


In [None]:
# Wrap the transformer output in a DataFrame with named feature columns.
# The `id` index of df_train is passed explicitly: without it the new frame
# gets a fresh 0..n-1 index, and writing it out with index_label='id' would
# store row positions under the `id` header instead of the real ids.
# NOTE(review): assumes the transformer emits exactly one row per input row,
# in input order -- confirm for the custom transformers.
df_train_enriched = pd.DataFrame(
    data=train_enriched,
    index=df_train.index,
    columns=transformer.get_feature_names_out(),
)

In [None]:
# Display the output column names produced by the fitted transformer
# (the cell's last expression is rendered as its output).
transformer.get_feature_names_out()

In [None]:
# Persist the enriched training frame; the frame's index is written as the
# first CSV column under the header 'id'.
df_train_enriched.to_csv('./train_enriched.csv', index_label='id')

In [None]:
# Enrich the test split with the same column transformer and save it.
# NOTE(review): this re-fits the transformer on the test frame, replacing the
# state fitted on the training data. It is kept as-is because the training
# frame appears to carry an extra passthrough column that the test frame
# lacks, so a plain transform() would likely be rejected -- confirm, and
# confirm the custom transformers learn nothing from the data in fit().
test_enriched = transformer.fit_transform(df_test)

# Carry the original `id` index of df_test over to the enriched frame.
# Without it the frame gets a fresh 0..n-1 index and the CSV's `id` column
# would hold row positions rather than the real test ids.
df_test_enriched = pd.DataFrame(
    data=test_enriched,
    index=df_test.index,
    columns=transformer.get_feature_names_out(),
)
df_test_enriched.to_csv('./test_enriched.csv', index_label='id')