# Feature selection

In [1]:
import pandas as pd
import bz2
import json
from os.path import join
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets, cluster
import numpy as np
from grammar_mappings import dtypes

## Data loading

### Speaker features

In [34]:
dir_path = '../../data/speaker_features'
speaker_filename = 'df_speaker_cooked.csv.gzip'

df_speaker = pd.read_csv(join(dir_path, speaker_filename), compression='gzip')
df_speaker.drop(columns=['Unnamed: 0'], inplace=True)
df_speaker.shape

(2654370, 7)

In [35]:
df_speaker.head(5)

Unnamed: 0,id,date_of_birth,nationality,gender,occupation,academic_degree,religion
0,Q42,1952.0,United Kingdom,Male,Arts,,
1,Q207,1946.0,United States of America,Male,Politics,,Christian
2,Q633,1945.0,Canada,Male,Arts,,
3,Q640,1969.0,Germany,Male,Arts,,
4,Q853,1932.0,Soviet Union,Male,Arts,,Christian


In [67]:
len(df_speaker.id.unique())

2654370

### Language features

In [85]:
def merge(df_quotes, df_speaker):
    df_quotes = df_quotes.set_index('qid')
    
    # drop duplicate rows (mostly data headers)
    df_quotes.drop_duplicates(inplace=True)
    df_quotes = df_quotes[df_quotes.quoteID.str.contains('quoteID') == False]
    
    # change value type
    for field in df_quotes.columns:
        df_quotes[field] = df_quotes[field].astype(dtypes[field], errors = 'raise')
    
    # merge df
    df = df_quotes.merge(df_speaker.set_index('id'), left_index=True, right_index=True)
 #   df = df_quotes.merge(df_speaker, left_on='qid', right_on='id')
 
    return df

In [92]:
dir_path = 'D:/ADA_quotebank/language_features'
filename = 'quotes-2020_with_language_feats.csv' # change years
outname = "merged_data_2020_new.csv.gzip" # change years

chunksize = 500000
df = pd.DataFrame()
flag = True
save = True
i = 0

for chunk in pd.read_csv(join(dir_path, filename), chunksize=chunksize, usecols=dtypes.keys(), dtype='O'):
    df = merge(chunk, df_speaker)
    if save:
        if flag:
            df.to_csv(join(dir_path, outname), index=False, compression="gzip", mode='a')
            flag = False
        df.to_csv(join(dir_path, outname), index=False, compression="gzip", mode='a', header=False)
    print(i, df.shape)
    i += 1

merged_data_2020_new.csv.gzip
0 (446142, 31)
merged_data_2020_new.csv.gzip
1 (446168, 31)
merged_data_2020_new.csv.gzip
2 (43253, 31)


In [63]:
# Check files were saved
dir_path = 'D:/ADA_quotebank/language_features'
filename = "merged_data_2015.csv.gzip"

chunksize = 1000000
for chunk in pd.read_csv(join(dir_path, filename), chunksize=chunksize, compression='gzip'):
    df = chunk
    break
    
df.shape

(1000000, 32)

In [93]:
test1 = pd.read_csv(join(dir_path, outname), compression='gzip')

In [94]:
test1.shape

(1381705, 31)

In [95]:
test1.drop_duplicates(inplace=True)
test1.shape

(935563, 31)

In [96]:
test1.to_csv(join(dir_path, outname), index=False, compression="gzip")

In [70]:
test.shape

(9092833, 33)

In [81]:
test.columns

Index(['quoteID', 'qid', 'sentence_count', '._per_sentence', ',_per_sentence',
       '!_per_sentence', '?_per_sentence', ':_per_sentence', ';_per_sentence',
       'sign_per_token', 'punctuation_per_sentence', 'approx_word_count',
       'token_count', 'adj_per_word', 'ordinal_ratio', 'comparative_ratio',
       'superlative_ratio', 'verb_per_word', 'base_ratio', 'pres_ratio',
       'past_ratio', 'pronoun_per_word', 'self_ratio', 'union_ratio',
       'other_ratio', 'sentiment', 'Unnamed: 0', 'date_of_birth',
       'nationality', 'gender', 'occupation', 'academic_degree', 'religion'],
      dtype='object')

In [71]:
test.drop_duplicates(inplace=True)
test.shape

(8648223, 33)

In [97]:
filename = "merged_data_2015_new.csv.gzip"
test.to_csv(join(dir_path, filename), index=False, compression="gzip")

In [64]:
df.drop_duplicates(inplace=True)
df.shape

(553858, 32)

In [38]:
ft_language = ['sentence_count', '._per_sentence', ',_per_sentence', '!_per_sentence', '?_per_sentence', ':_per_sentence',
 ';_per_sentence', 'sign_per_token', 'punctuation_per_sentence', 'approx_word_count', 'token_count', 'adj_per_word',
 'ordinal_ratio', 'comparative_ratio', 'superlative_ratio', 'verb_per_word', 'base_ratio', 'pres_ratio', 'past_ratio',
 'pronoun_per_word', 'self_ratio', 'union_ratio', 'other_ratio', 'sentiment']

ft_speaker = ['date_of_birth', 'nationality', 'gender', 'occupation', 'academic_degree', 'religion']

### Tree-based feature selection

In [39]:
def tree_feature_select(X, y):
    '''
    Does feature selection using decision trees
    
    :param X: features (n_rows, n_features)
    :param y: target (n_rows,)
    :return clf: extra trees classifiers
    :return model: model with reduced features
    '''
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X, y)
    model = SelectFromModel(clf, prefit=True)
    
    return clf, model


def select_predictors(df, ft_language, target):
    '''
    Extracts portion of the df and extracts the most relevant features based on a target speaker attribute
    
    :param df: merged dataframe with language and speaker features
    :param ft_language: list of language features
    :param target: target speaker attribute to predict
    :return clf: extra trees classifier
    :return model: selected model
    :return shape: shape of the data used (some target speaker features have nan which needs to be removed)
    :return X: features used to get model
    :return y: targets used to get model
    '''
    temp = df[ft_language].copy()
    temp[target] = df[target]
    temp.dropna(inplace=True)
    
    X = temp[ft_language].to_numpy()
    y = temp[target].to_numpy()
    clf, model = tree_feature_select(X, y)
    clf.feature_importances_
    
    return clf, model, temp.shape, X, y

#### Gender

In [51]:
target = 'gender'
clf, model, size, X, y = select_predictors(df, ft_language, target)
ft_selected = model.get_feature_names_out(ft_language)
ft_importance = clf.feature_importances_
print('Input data size: ' + str(size))
print(ft_selected)
print(ft_importance)

Input data size: (1000000, 25)
[',_per_sentence' 'sign_per_token' 'approx_word_count' 'token_count'
 'adj_per_word' 'verb_per_word' 'base_ratio' 'pres_ratio' 'past_ratio'
 'pronoun_per_word' 'sentiment']
[0.02130124 0.00715211 0.0458133  0.0056248  0.00507215 0.00443799
 0.00429285 0.07143094 0.00519523 0.06671137 0.07008564 0.07603384
 0.01274571 0.01064285 0.00906929 0.08930027 0.04780305 0.0500179
 0.04409552 0.06293489 0.01288144 0.00988345 0.0116365  0.25583765]


In [52]:
X_new = model.transform(X)
df_new = pd.DataFrame(X_new, columns=ft_selected)
df_new.head(5)

Unnamed: 0,",_per_sentence",sign_per_token,approx_word_count,token_count,adj_per_word,verb_per_word,base_ratio,pres_ratio,past_ratio,pronoun_per_word,sentiment
0,0.0,0.25,9.0,12.0,0.333333,0.333333,-1.0,1.0,-1.0,0.0,-0.478
1,0.0,0.085714,32.0,35.0,0.0625,0.1875,0.0,0.0,-1.0,0.09375,0.301
2,1.0,0.2,8.0,10.0,0.375,0.0,0.0,0.0,0.0,0.0,-0.636
3,1.0,0.041667,23.0,24.0,0.043478,0.217391,0.2,-0.6,-0.6,0.0,0.291
4,0.0,0.066667,14.0,15.0,0.0,0.142857,-1.0,0.0,0.0,0.0,0.0


#### Academic degree

In [53]:
target = 'academic_degree'
clf, model, size, X, y = select_predictors(df, ft_language, target)
ft_selected = model.get_feature_names_out(ft_language)
ft_importance = clf.feature_importances_
print('Input data size: ' + str(size))
print(ft_selected)
print(ft_importance)

Input data size: (66817, 25)
[',_per_sentence' 'sign_per_token' 'approx_word_count' 'token_count'
 'adj_per_word' 'verb_per_word' 'base_ratio' 'pres_ratio' 'past_ratio'
 'pronoun_per_word' 'sentiment']
[0.01902612 0.00742463 0.0499002  0.00439234 0.00526699 0.00643946
 0.00426018 0.07748047 0.00456877 0.07604585 0.07813609 0.0758923
 0.01116957 0.00924575 0.00790786 0.09265254 0.05752249 0.06138377
 0.05186353 0.05350653 0.0152148  0.01376018 0.0160531  0.20088647]


#### Religion

In [54]:
target = 'religion'
clf, model, size, X, y = select_predictors(df, ft_language, target)
ft_selected = model.get_feature_names_out(ft_language)
ft_importance = clf.feature_importances_
print('Input data size: ' + str(size))
print(ft_selected)
print(ft_importance)

Input data size: (228978, 25)
[',_per_sentence' 'sign_per_token' 'approx_word_count' 'token_count'
 'adj_per_word' 'verb_per_word' 'base_ratio' 'pres_ratio' 'past_ratio'
 'pronoun_per_word' 'sentiment']
[0.02118925 0.00911508 0.04787876 0.00448275 0.00693642 0.00479673
 0.00415829 0.0723747  0.00544839 0.07116436 0.07407169 0.07415109
 0.01326976 0.01095889 0.00975449 0.087494   0.05099884 0.0529414
 0.04496076 0.05835626 0.01876152 0.01717369 0.02040654 0.21915636]


#### Occupation

In [55]:
target = 'occupation'
clf, model, size, X, y = select_predictors(df, ft_language, target)
ft_selected = model.get_feature_names_out(ft_language)
ft_importance = clf.feature_importances_
print('Input data size: ' + str(size))
print(ft_selected)
print(ft_importance)

Input data size: (986161, 25)
[',_per_sentence' 'sign_per_token' 'approx_word_count' 'token_count'
 'adj_per_word' 'verb_per_word' 'base_ratio' 'pres_ratio' 'past_ratio'
 'pronoun_per_word' 'sentiment']
[0.02102178 0.00886067 0.04579123 0.00448464 0.00542813 0.00409131
 0.00422546 0.07032331 0.0058153  0.06528612 0.0688696  0.07679083
 0.01463542 0.01229463 0.01040372 0.08600327 0.04751286 0.05153817
 0.04441895 0.0664733  0.0151745  0.01514551 0.01422454 0.24118673]


#### Nationality - does not run because too many nationalities!

In [None]:
target = 'nationality'
clf, model, size, X, y = select_predictors(df, ft_language, target)
ft_selected = model.get_feature_names_out(ft_language)
ft_importance = clf.feature_importances_
print('Input data size: ' + str(size))
print(ft_selected)
print(ft_importance)

### Agglomeration

In [56]:
X = df[ft_language].copy().to_numpy()
X.shape

(1000000, 24)

In [57]:
agglo = cluster.FeatureAgglomeration(n_clusters=None, distance_threshold=120)
agglo.fit(X)
X_reduced = agglo.transform(X)
X_reduced.shape

(1000000, 22)

In [58]:
df_new = pd.DataFrame(X_reduced)
df_new.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.194444,-0.478,1.0,0.0,-1.0,-1.0,-1.0,12.0,0.0,0.0,...,-1.0,9.0,1.0,0.0,1.0,0.0,0.333333,0.0,0.0,0.0
1,0.080655,0.301,0.0,1.0,-1.0,0.0,-1.0,35.0,-1.0,0.0,...,-1.0,32.0,1.0,0.0,1.0,-1.0,0.1875,0.0,0.0,0.0
2,0.191667,-0.636,0.0,0.0,0.0,0.0,-1.0,10.0,0.0,0.0,...,-1.0,8.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.028382,0.291,-0.6,0.0,-0.6,0.2,-1.0,24.0,0.0,0.0,...,-1.0,23.0,1.0,0.0,0.0,0.0,0.217391,1.0,0.0,0.0
4,0.022222,0.0,0.0,0.0,0.0,-1.0,0.0,15.0,0.0,0.0,...,0.0,14.0,0.0,0.0,1.0,0.0,0.142857,0.0,0.0,0.0


## Dictionary - not used

In [None]:
from collections import defaultdict

gender = {
    'Male': 0,
    'Female': 1,
    'Other': 2
}

religion = {
    'Christian': 0,
    'Hindus': 1, 
    'Muslim': 2, 
    'Jewish': 3,
    'Other': 4 
}

academic_degree = {
    'Bachelor': 0, 
    'Master': 1,
    'Doctorate': 2, 
    'Other': 3
}

occupation = {
    'Politics': 0, 
    'Arts': 1, 
    'Military': 2, 
    'Sciences': 3, 
    'Business': 4,
    'Sports': 5, 
    'Religion': 6,
    'Other': 7
}

nationality = defaultdict()
nationalities = df_s.nationality.unique()
for i in range(len(nationalities)):
    nationality[nationalities[i]] = i