In [1]:
import configuration

from src import tokenizer, features, settings, classification
from sklearn.pipeline import Pipeline, FeatureUnion
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm, tqdm_notebook
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# Hook tqdm into pandas so the `progress_*` variants of apply/map are available.
tqdm.pandas()

# Notebook display settings: show every column, keep wide frames on a single
# line instead of wrapping them, and never truncate the text inside a cell.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the input subset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='../data/selected/Subset_Data_Uniques.csv')

In [None]:
# Tokenizer instance handed to the project's TokenizerLan step.
obj_tokenizer = TweetTokenizer()
# `column` is passed to every extractor below; `lang` only to the sentiment and
# part-of-speech extractors.
# NOTE(review): presumably both are column names of `data` (text / per-row language) — confirm.
column, lang = 'fixed_text_expanded', 'lan_final'


# Sentence-level branch: SentenceLevelStats feeds FeaturesExistence.
sentence_pipeline = Pipeline([
    ('SentenceLevelStats', features.SentenceLevelStats(column)),
    ('FeaturesExistence', features.FeaturesExistence())
])

# Word-level branch: tokenize, then compute stats over 'tokens'.
# The second step used to be labelled 'FeaturesExistence' (copied from the
# sentence branch) although it wraps WordLevelStats; the label now matches.
word_pipeline = Pipeline([
    ('TweetTokenizer', tokenizer.TokenizerLan(obj_tokenizer, column)),
    ('WordLevelStats', features.WordLevelStats('tokens'))
])

# All four feature blocks are combined by a FeatureUnion, in the order listed.
pipeline = Pipeline([
    ('FeatureUnion', FeatureUnion([
        ('sentence_pipeline', sentence_pipeline),
        ('word_pipeline', word_pipeline),
        ('SentimentPolarity', features.SentimentPolarity(column, lang, False)),  # last param is_translated
        ('PartOfSpeech', features.PartOfSpeech(column, lang, False)),  # last param is_translated
    ]))
])

# NOTE(review): transform() is called without a prior fit(), which only works if
# every project transformer is stateless. Recent scikit-learn releases reportedly
# warn about transform() on an unfitted Pipeline — verify against the installed version.
statistical_features = pipeline.transform(data)

In [4]:
# Rebuild the column labels for the feature matrix, block by block:
# sentence-level descriptors, a 'has_<name>' label for each descriptor whose
# name starts with '_', word-level descriptors, polarity, PartOfSpeech columns.
# NOTE(review): this ordering has to mirror what the FeatureUnion emits — confirm.
sentence_names = list(features.SentenceLevelStats(column).build_descriptors().keys())
columns = sentence_names + [f'has_{name}' for name in sentence_names if name.startswith('_')]

columns.extend(features.WordLevelStats(column).build_descriptors().keys())
# The second-to-last label so far gets a 'word_' prefix.
# NOTE(review): presumably to tell it apart from a sentence-level descriptor of the same name — confirm.
columns[-2] = f'word_{columns[-2]}'

columns.append('polarity')
columns.extend(features.PartOfSpeech(column, lang, False).columns.keys())  # last param is_translated

dataframe = pd.DataFrame(data=statistical_features, columns=columns)

In [5]:
# Remove the 'tokens' column from `data` before it is combined with the features.
# NOTE(review): 'tokens' is presumably added to `data` in place by the tokenizer step — confirm.
# errors='ignore' keeps this cell re-runnable: a plain `del data['tokens']`
# raises KeyError once the column is already gone.
data = data.drop(columns=['tokens'], errors='ignore')

In [6]:
# Put the extracted feature columns to the right of the source columns.
# Rows are matched on index labels.
# NOTE(review): assumes `data` and `dataframe` share the same default index — confirm.
final = pd.concat(objs=[data, dataframe], axis='columns', sort=False)

In [7]:
# Columns that are converted to numeric values later in the notebook.
numeric_cols = [
    'uniqueness',
    # '_'-prefixed columns; each one has a matching 'has_<name>' flag column
    '_url', '_user', '_hashtag',
    '_cry', '_bless', '_fear', '_sad', '_lol',
    '_exclamation', '_interrogate',
    'nchars',
    'word_uniqueness', 'ntokens',
    'polarity',
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    'I-LOC', 'I-ORG', 'I-PER',
]

# Flag columns that are cast to int later: one 'has_<name>' entry per
# '_'-prefixed column above, in the same order ('has__url', 'has__user', ...).
boolean_cols = ['has_' + name for name in numeric_cols if name.startswith('_')]

In [8]:
# Convert the numeric feature columns; values that cannot be parsed become NaN.
numeric_block = final[numeric_cols].apply(lambda series: pd.to_numeric(series, errors='coerce'))
final[numeric_cols] = numeric_block

# Store the 'has_*' flag columns as integers.
indicator_block = final[boolean_cols].astype(int)
final[boolean_cols] = indicator_block

In [None]:
def binarize_count(number):
    """Return 1 when `number` is greater than zero, otherwise 0."""
    if number > 0:
        return 1
    return 0

# Add a 0/1 'has__<name>' flag for each of the I-LOC / I-ORG / I-PER columns.
for entity_col in ("I-LOC", "I-ORG", "I-PER"):
    final["has__" + entity_col] = final[entity_col].apply(binarize_count)

In [12]:
# Write the enriched frame to disk; index=False leaves the row index out of the file.
final.to_csv(path_or_buf='../data/selected/Subset_Data_Uniques_LF.csv', index=False)