# Serious Machine Learning The More Correct Way

### Importing data
#### Dataset from Kaggle: https://www.kaggle.com/datasnaek/mbti-type
##### This data was collected from an internet forum focused on personality types. As such, the text contains a much greater number of explicit mentions of these personality types as I would expect to find "in the wild." Because of this, my expectation is that any model trained from this data will not generalize well to other, broader sources.

In [1]:
import pandas as pd
df = pd.read_csv("./data/mbti-type.zip", compression="zip")
print(df.shape)
df.head()

(8675, 2)


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
# Preprocessing
from nltk.corpus import stopwords as nltk_stopwords

type_stopwords = list(map(lambda x: x.lower(), df.type.unique()))
plural_type_stopwords = [x + "s" for x in type_stopwords]

all_stopwords = type_stopwords + plural_type_stopwords + list(nltk_stopwords.words('english'))

In [3]:
# Train/Test Splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.posts, 
                                                    df.type, 
                                                    test_size=0.2, 
                                                    random_state=501)

In [4]:
# Vectorization (using simple count vectorization for now)
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(df.posts)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Feature selection using simple variance
from sklearn.feature_selection import VarianceThreshold

X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

thresholdValue = 0.95
selector = VarianceThreshold(threshold=(thresholdValue * (1 - thresholdValue)))

X_train_selected = selector.fit_transform(X_train_vec)
X_test_selected = selector.transform(X_test_vec)

print(X_train_selected.shape)
print(X_test_selected.shape)

(6940, 2418)
(1735, 2418)


In [6]:
# Grid search a couple of important parameters in a SVM classifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

parameters_to_tune = [
    {
        "max_depth": range(2,20, 2)
    }
]

grid_search = GridSearchCV(RandomForestClassifier(),
                          parameters_to_tune,
                          cv = 3, # number of "folds" in cross-validation
                          scoring = "precision_micro")

grid_search.fit(X_train_selected, y_train)

  from numpy.core.umath_tests import inner1d


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': range(2, 20, 2)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision_micro', verbose=0)

In [7]:
from sklearn.metrics import classification_report

print(grid_search.best_params_)
y_true, y_pred = y_test, grid_search.predict(X_test_selected)
print(classification_report(y_true, y_pred))

{'max_depth': 12}
             precision    recall  f1-score   support

       ENFJ       0.67      0.06      0.11        33
       ENFP       0.58      0.21      0.30       136
       ENTJ       0.00      0.00      0.00        43
       ENTP       0.53      0.32      0.40       128
       ESFJ       0.00      0.00      0.00         8
       ESFP       0.00      0.00      0.00        10
       ESTJ       0.00      0.00      0.00         6
       ESTP       0.00      0.00      0.00        21
       INFJ       0.43      0.49      0.46       307
       INFP       0.42      0.80      0.55       380
       INTJ       0.44      0.45      0.45       214
       INTP       0.41      0.49      0.45       249
       ISFJ       0.50      0.03      0.05        38
       ISFP       0.50      0.04      0.07        52
       ISTJ       0.00      0.00      0.00        45
       ISTP       0.69      0.17      0.27        65

avg / total       0.43      0.44      0.39      1735



  'precision', 'predicted', average, warn_for)
