# In [4]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier



dataset_filename = 'C:/GitHub/mbti-type/mbti_1.csv'

# The CSV already has the headers 'type' and 'posts'.
mbti = pd.read_csv(dataset_filename)
mbti.head()

# Replace the '|||' separator inside every 'posts' cell with a single space.
# This is done in one vectorized pass instead of a per-row .loc assignment.
# regex=False is required: as a regular expression '|||' would be a chain of
# empty alternatives and match at every position.
mbti['posts'] = mbti['posts'].str.replace('|||', ' ', regex=False)


def test_classifier(classifier, name,  X, y, lsa=False,
                    n_components=50, cv=5, random_state=None):
    """Cross-validate a TF-IDF text pipeline ending in ``classifier``.

    The pipeline is TF-IDF -> (optional TruncatedSVD "LSA" step) -> classifier
    and is scored with ``cross_val_score`` using the 'accuracy' metric.

    Parameters
    ----------
    classifier : estimator
        Final step of the pipeline.
    name : str
        Label echoed back in the returned row.
    X : raw documents handed to ``TfidfVectorizer``.
    y : target labels.
    lsa : bool, default False
        When true, insert a ``TruncatedSVD`` step between TF-IDF and the
        classifier.
    n_components : int, default 50
        Number of SVD components; only used when ``lsa`` is true.
    cv : default 5
        Passed straight to ``cross_val_score`` as its ``cv`` argument.
    random_state : default None
        Passed to ``TruncatedSVD``; only used when ``lsa`` is true.

    Returns
    -------
    list
        ``[name, lsa, mean accuracy across the folds]``.
    """
    tfidf_model = TfidfVectorizer(smooth_idf=True,
                                  sublinear_tf=True,
                                  lowercase=True,
                                  stop_words='english')

    # Build the step list once; the LSA step is the only optional part.
    steps = [('tfidf', tfidf_model)]
    if lsa:
        steps.append(('lsa', TruncatedSVD(n_components=n_components,
                                          random_state=random_state)))
    steps.append(('classifier', classifier))
    model = Pipeline(steps)

    accuracy = cross_val_score(estimator=model,
                               X=X,
                               y=y,
                               scoring='accuracy',
                               cv=cv,
                               n_jobs=-1)

    return [name, lsa, accuracy.mean()]


# (estimator instance, display name) pairs to benchmark.
classifiers = [
        (GaussianNB(), 'GaussianNB'),
        (SGDClassifier(), 'SGDClassifier'),
        (RandomForestClassifier(), 'RandomForestClassifier'),
        (ExtraTreesClassifier(), 'ExtraTreesClassifier'),
        (DecisionTreeClassifier(), 'DecisionTreeClassifier')
        ]

# One row per classifier that was evaluated successfully.
results = pd.DataFrame(columns=('Classifier', 'LSA', 'Mean accuracy'))
k = 0

for classifier, name in classifiers:
    try:
        row = test_classifier(
                classifier=classifier,
                name=name,
                X=mbti['posts'],
                y=mbti['type'],
                lsa=False)
    except Exception as exc:
        # Best-effort benchmark: keep going, but report the failure so a
        # missing row in the results table can be explained.
        print('Skipping %s: %r' % (name, exc))
    else:
        results.loc[k] = row
        k += 1

print(results)
    





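# Recorded notebook output. NOTE(review): it contains an LSA=True row, but the
# loop above only runs lsa=False, so this looks like output from a different
# run -- confirm.
#
#       Classifier    LSA  Mean accuracy
# 0  SGDClassifier  False       0.659820
# 1  SGDClassifier   True       0.577243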
