In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA

In [2]:
# One lookup table per position of the 4-letter type string; each maps the
# letter found at that position to a 0/1 label.
t2idx = [{'I': 0, 'E': 1}, {'N': 0, 'S': 1}, {'T': 0, 'F': 1}, {'J': 0, 'P': 1}]

dataset = pd.read_csv('mbti_1.csv')

# Drop the first and last character of every 'posts' entry (presumably
# wrapping quote characters -- confirm against the CSV) and replace the
# '|||' separators with single spaces.
posts = [raw_text[1:-1].replace('|||', ' ') for raw_text in dataset['posts']]

# Encode each type string as four 0/1 labels, one per position.
types = [
    [table[type_code[position]] for position, table in enumerate(t2idx)]
    for type_code in dataset['type']
]

# Label matrix: one row per CSV row, one column per type position.
Y = np.array(types)

In [3]:
# Build the raw term-count matrix over all posts. The custom token pattern
# accepts any run of word characters, so single-character tokens are kept
# (scikit-learn's default pattern requires at least two characters).
count_vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
counts = count_vectorizer.fit_transform(posts)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Prefer the new method and fall
# back to the old one so the cell runs on either side of the change.
# The result is kept as a plain list, matching what the old method returned.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

In [4]:
# Re-weight the raw term counts with TF-IDF, using the transformer's
# default settings. Fitting and transforming are done on the same matrix.
transformer = TfidfTransformer()
transformer.fit(counts)
tfidf = transformer.transform(counts)

In [5]:
# Score every vocabulary term against the label matrix Y with the
# chi-squared statistic. SelectKBest is created with its default k here;
# only the per-feature scores are used in this cell.
feature_selection = SelectKBest(chi2)

# SelectKBest and chi2 accept scipy sparse matrices, so the TF-IDF matrix is
# passed as-is. Densifying it first would allocate a full
# (documents x vocabulary) float array for no benefit.
feature_selection.fit(tfidf, Y)

# Pair each term with its score and write the ranking, best first, to disk.
feature_name_score = pd.DataFrame({
    'name': feature_names,
    'score': feature_selection.scores_,
})
feature_name_score.sort_values(by='score', ascending=False).to_csv('feature_name_score.csv')

In [10]:
# Number of top-scoring features to keep in the exported dataset.
k = 16

# Re-use the already fitted selector: changing k only changes how many of the
# stored scores are kept, so no refit is needed.
feature_selection.set_params(k=k)

# Select columns on the sparse TF-IDF matrix and densify only the k columns
# that survive, instead of materialising the whole vocabulary as a dense array.
X = feature_selection.transform(tfidf).toarray()

# Output layout: label columns Y0..Y3 followed by feature columns X0..X{k-1}.
# Building the two halves as separate frames keeps the integer dtype of the
# labels and the float dtype of the features.
label_frame = pd.DataFrame(Y[:, :4], columns=['Y' + str(i) for i in range(4)])
selected_frame = pd.DataFrame(X[:, :k], columns=['X' + str(i) for i in range(k)])
feature_dataset = pd.concat([label_frame, selected_frame], axis=1)

feature_dataset.to_csv('feature_dataset_' + str(k) + '.csv')