# Enrich test and training datasets

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing
from keyword_transformer import KeywordTransformer
from location_transformer import LocationTransformer
from text_stats_transformer import TextStatsTransformer


In [2]:
# Load both raw splits the same way, using the `id` column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('./train.csv', './test.csv')
)


In [3]:
# Feature-enrichment step: each source column is routed to its own custom
# transformer; every column not listed is passed through unchanged.
transformer = compose.ColumnTransformer(
    transformers=[
        # 'keyword' column, enriched using the statistics file given below.
        (
            'keywords',
            KeywordTransformer(keyword_file_name='./keywords_stats.csv'),
            ['keyword'],
        ),
        # 'location' column (this is the slowest step in the recorded runs).
        ('location', LocationTransformer(), ['location']),
        # 'text' column -> cleaned text and text statistics.
        ('text-features', TextStatsTransformer(), ['text']),
    ],
    # Keep untouched columns; they are appended after the transformed ones.
    remainder='passthrough',
    # Log per-step timing while fitting.
    verbose=True,
    # Do not prefix output column names with the step name.
    verbose_feature_names_out=False,
)

In [4]:
# Fit the enrichment transformer on the training frame and compute the
# enriched feature matrix in one pass. This is slow: the location step
# alone took close to a minute in the recorded run.
train_enriched = transformer.fit_transform(df_train)

[ColumnTransformer] ...... (1 of 4) Processing keywords, total=   0.0s
[ColumnTransformer] ...... (2 of 4) Processing location, total=  55.6s
[ColumnTransformer] . (3 of 4) Processing text-features, total=  12.5s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s


In [5]:
# Wrap the transformer output in a labelled DataFrame.
# The transformer output does not carry the input frame's row index
# (assuming the default, non-pandas output configuration), so the original
# `id` index is re-attached explicitly. Without it the frame would get a
# fresh 0..n-1 range, which would later be written to disk under the
# `id` label in place of the real ids.
df_train_enriched = pd.DataFrame(
    data=train_enriched,
    columns=transformer.get_feature_names_out(),
    index=df_train.index,
)

In [6]:
# Show the output column names in the order the fitted transformer emits
# them (transformed columns first, passthrough columns last).
transformer.get_feature_names_out()

array(['keyword', 'positive_factor', 'location', 'country', 'state',
       'city', 'missing_location', 'text', 'clean_text', 'text_length',
       'upper_text_factor', 'tags_count', 'punct_factor', 'ann_count',
       'urls_count', 'tokens_count', 'stop_words_factor',
       'clean_tokens_factor', 'url_domains', 'url_redirects_count',
       'hashtags_sentiment', 'target'], dtype=object)

In [7]:
# Persist the enriched training set; the frame's index is written as the
# first column under the header `id`.
df_train_enriched.to_csv(
    path_or_buf='./train_enriched.csv',
    index_label='id',
)

In [8]:
# Enrich the test split and persist it.
# NOTE(review): this re-fits the transformer on the test data instead of
# reusing the fit from the training split. That looks deliberate, since the
# test frame has no passthrough `target` column, but if any of the custom
# transformers learn state during fit, train and test features may not be
# computed consistently -- confirm.
test_enriched = transformer.fit_transform(df_test)

# Re-attach the original `id` index: the transformer output does not carry
# it (assuming the default, non-pandas output configuration), and without it
# the CSV's `id` column would hold a 0..n-1 range instead of the real ids.
df_test_enriched = pd.DataFrame(
    data=test_enriched,
    columns=transformer.get_feature_names_out(),
    index=df_test.index,
)
df_test_enriched.to_csv('./test_enriched.csv', index_label='id')

[ColumnTransformer] ...... (1 of 3) Processing keywords, total=   0.0s
[ColumnTransformer] ...... (2 of 3) Processing location, total=  19.0s
[ColumnTransformer] . (3 of 3) Processing text-features, total=   3.7s
