In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier



# NOTE(review): machine-specific absolute path; adjust it (or make it
# relative to a data directory) when running on another machine.
dataset_filename = 'C:/GitHub/mbti-type/mbti_1.csv'

# The CSV already ships with the 'type' and 'posts' headers.
mbti = pd.read_csv(dataset_filename)

# Replace the '|||' separator inside every 'posts' entry with a single space.
# Done as one vectorized string operation instead of a per-row .loc loop, so
# it does not depend on the index being 0..n-1. regex=False is required
# because '|' is the alternation operator in a regular expression.
mbti['posts'] = mbti['posts'].str.replace('|||', ' ', regex=False)

# Preview the cleaned frame (last expression of the cell, so it is rendered).
mbti.head()


In [2]:
def test_classifier(classifier, name,  X, y, lsa=False):

    tfidf_model = TfidfVectorizer(smooth_idf=True,
                              sublinear_tf=True,
                              lowercase=True,
                              stop_words='english')

    if lsa:
        lsa_model = TruncatedSVD(n_components=50)    

        model= Pipeline([
            ('tfidf', tfidf_model),
            ('lsa', lsa_model),
            ('classifier', classifier)])
    else:
        model= Pipeline([
            ('tfidf', tfidf_model),
            ('classifier', classifier)])
        
    accuracy = cross_val_score(estimator=model,
                               X=X,
                               y=y,
                               scoring='accuracy',
                               cv=5,
                               n_jobs=-1)

        
    return([name, lsa, accuracy.mean()])


In [3]:
# Setup: the estimators to compare and the table that collects their scores.

# Shared seed for the stochastic estimators so that repeated runs of the
# notebook produce the same accuracy table. GaussianNB takes no seed.
RANDOM_STATE = 42

# (estimator instance, label shown in the results table)
classifiers = [
        (GaussianNB(), 'GaussianNB'),
        (SGDClassifier(random_state=RANDOM_STATE), 'SGDClassifier'),
        (RandomForestClassifier(random_state=RANDOM_STATE), 'RandomForestClassifier'),
        (ExtraTreesClassifier(random_state=RANDOM_STATE), 'ExtraTreesClassifier'),
        (DecisionTreeClassifier(random_state=RANDOM_STATE), 'DecisionTreeClassifier')
        ]

# One row per successful (classifier, LSA on/off) evaluation.
results = pd.DataFrame(columns=('Classifier', 'LSA', 'Mean accuracy'))
# Index label of the next row to be written into `results`.
k = 0



In [4]:

# Evaluate every classifier twice: first on the TF-IDF features alone
# (lsa=False), then with the LSA step enabled (lsa=True).
for classifier, name in classifiers:
    for use_lsa in (False, True):
        try:
            results.loc[k] = test_classifier(
                    classifier=classifier,
                    name=name,
                    X=mbti['posts'],
                    y=mbti['type'],
                    lsa=use_lsa)
            k += 1

        except Exception as exc:
            # Best-effort run: a failing combination must not abort the
            # remaining evaluations, but it is reported instead of being
            # silently dropped so a missing row in the table is explained.
            print('Skipping {} (lsa={}): {!r}'.format(name, use_lsa, exc))

print(results)


               Classifier    LSA  Mean accuracy
0              GaussianNB   True       0.594357
1           SGDClassifier  False       0.660164
2           SGDClassifier   True       0.572088
3  RandomForestClassifier  False       0.275707
4  RandomForestClassifier   True       0.439191
5    ExtraTreesClassifier  False       0.273415
6    ExtraTreesClassifier   True       0.400218
7  DecisionTreeClassifier  False       0.464427
8  DecisionTreeClassifier   True       0.334520
