In [1]:
import sklearn 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
df_dreams = pd.read_csv('data/rsos_dream_data.csv')
df_dreams.head(8)
df_dreams.shape

(21001, 21)

In [3]:
df_dreams = df_dreams[['text_dream','emotions_code']].dropna()

In [4]:
df_dreams['text_dream'] = df_dreams['text_dream'].str.lower()

In [5]:
df_dreams.head(8)

Unnamed: 0,text_dream,emotions_code
1,i'm at a family reunion in a large fine house ...,SD 2IKA
2,i watch a plane fly past and shortly realize i...,"SD 1ISA, AP D, AP D"
3,me pulling the green leaves and berries off so...,"SD 2ISA, SD D"
4,i'm in a room that reminds me of (but definite...,"AP D, AP D, AP 1MSA, CO D, SD D, AP D"
5,living next door to loretta in an apartment - ...,HA 1FSA
6,kidnapped - i'm on my way somewhere else (by c...,"AN 1ISA, AN D, SD 2ISA, AN D"
7,"i'm alone in an apartment - old place, inside ...","AP D, SD D"
11,me and mb (one of my cats) in a big fancy dark...,"AP 1ISA, SD 2ISA, AP 1IKA"


In [6]:
df_dreams.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11086 entries, 1 to 20999
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_dream     11086 non-null  object
 1   emotions_code  11086 non-null  object
dtypes: object(2)
memory usage: 259.8+ KB


#### Splitting emotions_code column to keep only the first emotion code.

In [7]:
df_dreams["emotion_code"] = df_dreams['emotions_code'].str.split(" ", n = 1, expand = True)[0]

In [8]:
df_dreams["emotion_code"].value_counts()

AP             3503
HA             2161
AN             2117
CO             1682
SD             1612
AP1IKA,           2
CO1IKA,           2
AP1IKA            2
MAN1MKA           1
CO1IKA            1
WOMAN2IDA         1
INDIAN1IEA        1
REAGAN1IKA,       1
Name: emotion_code, dtype: int64

In [9]:
df_dreams.shape

(11086, 3)

In [10]:
target_codes = ["AP", "HA", "AN", "CO", "SD"]

df_dreams = df_dreams[df_dreams['emotion_code'].isin(target_codes)]

In [11]:
categories = {
    'AP': 'Apprehension',
    'HA': 'Happiness',
    'AN': 'Anger',
    'CO': 'Confusion',
    'SD': 'Sadness'
}

print(categories)

{'AP': 'Apprehension', 'HA': 'Happiness', 'AN': 'Anger', 'CO': 'Confusion', 'SD': 'Sadness'}


In [12]:
df_dreams.shape

(11075, 3)

In [13]:
X = df_dreams['text_dream'].to_list()
y = df_dreams['emotion_code'].to_list()

In [14]:
type(X), len(X)

(list, 11075)

In [15]:
len(X),len(y)

(11075, 11075)

In [16]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [17]:
print("X_train shape: {}" .format(len(X_train)))
print("y_train shape: {}" .format(len(y_train)))

X_train shape: 8860
y_train shape: 8860


In [18]:
print("X_test shape: {}" .format(len(X_test)))
print("y_test shape: {}" .format(len(y_test)))

X_test shape: 2215
y_test shape: 2215


### Tokenizing text with scikit-learn

### Term frequencies

### Training a classifier

#### Building a pipeline

# TODO:

* restart your kernel
* load your data etc
* instead of the above steps for vectorixzatoin and tfidf and training, use a pipeline to do the same, then delete the above steps
* evaluate your performance on your test set
* is the performance good? probably no, use confusion matrix for your classifier as well
* run grid search to improve your model


In [19]:
text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=True, stop_words="english", strip_accents='unicode', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_jobs=-1)),
])

In [20]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english',
                                 strip_accents='unicode')),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_jobs=-1))])

### Evaluating performance of test set

In [21]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.49435665914221216

### Parameter tuning with grid search

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'clf__n_estimators': [50, 100, 200, 500],
    'clf__max_features': [2, 4, 6, 8],
}

In [None]:
gs_clf = GridSearchCV(text_clf, param_grid, cv=5, n_jobs=-1)

In [None]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [None]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

#below output of comp.graphics seems incorrect. in tutorial the output is soc.religion.christian

In [None]:
print("Best parameter (CV score=%0.3f):" % gs_clf.best_score_)
print(gs_clf.best_params_)