In [13]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Read the raw dream-report table; the path is relative to the notebook's working directory.
df_dreams = pd.read_csv('data/rsos_dream_data.csv')
# Only a cell's final expression is rendered, so this cell reports just the
# (rows, columns) shape of the freshly loaded table.
df_dreams.shape

(21001, 21)

In [3]:
# Keep only the dream narrative and its emotion annotation, then discard every
# row in which either of the two fields is missing.
df_dreams = (
    df_dreams
    .loc[:, ['text_dream', 'emotions_code']]
    .dropna()
)

In [4]:
# Normalise the dream narratives to lower case.
df_dreams = df_dreams.assign(
    text_dream=lambda frame: frame['text_dream'].str.lower()
)

In [None]:
# Preview the first eight rows of df_dreams.
df_dreams.head(8)

In [None]:
# Summarise the columns: dtypes and non-null counts.
df_dreams.info()

#### Splitting emotions_code column to keep only the first emotion code.

In [5]:
# Split `emotions_code` at the first space and keep the leading piece as the
# row's single emotion code.
df_dreams["emotion_code"] = (
    df_dreams['emotions_code']
    .str.split(" ", n=1)
    .str.get(0)
)

In [6]:
# Frequency of each extracted first code, most common first.
df_dreams["emotion_code"].value_counts()

AP             3503
HA             2161
AN             2117
CO             1682
SD             1612
AP1IKA,           2
CO1IKA,           2
AP1IKA            2
MAN1MKA           1
CO1IKA            1
WOMAN2IDA         1
INDIAN1IEA        1
REAGAN1IKA,       1
Name: emotion_code, dtype: int64

In [None]:
# (rows, columns) of df_dreams at this point in the notebook.
df_dreams.shape

In [7]:
# The five emotion codes kept as classification targets.
target_codes = ["AP", "HA", "AN", "CO", "SD"]

# Drop every row whose first emotion code is not one of the targets.
df_dreams = df_dreams.loc[lambda frame: frame['emotion_code'].isin(target_codes)]

In [8]:
# Human-readable label for each emotion code.
categories = dict(
    AP='Apprehension',
    HA='Happiness',
    AN='Anger',
    CO='Confusion',
    SD='Sadness',
)

print(categories)

{'AP': 'Apprehension', 'HA': 'Happiness', 'AN': 'Anger', 'CO': 'Confusion', 'SD': 'Sadness'}


In [None]:
# (rows, columns) of df_dreams at this point in the notebook.
df_dreams.shape

In [9]:
# Documents (X) and their emotion codes (y) as plain Python lists.
X = df_dreams['text_dream'].tolist()
y = df_dreams['emotion_code'].tolist()

In [None]:
# Check the container type of X and how many documents it holds.
type(X), len(X)

In [None]:
# Documents and labels should have the same length.
len(X),len(y)

In [10]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs. train_test_split comes from the notebook's
# import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# X_train and y_train are plain lists, so their sizes come from len(), not .shape.
print("X_train size: {}".format(len(X_train)))
print("y_train size: {}".format(len(y_train)))

In [None]:
# X_test and y_test are plain lists, so their sizes come from len(), not .shape.
print("X_test size: {}".format(len(X_test)))
print("y_test size: {}".format(len(y_test)))

In [None]:
# Note: X_train, X_test, y_train and y_test are plain Python lists, not
# DataFrames or arrays, so their size comes from len() rather than .shape.

# PS: the same preprocessing can be done while staying with DataFrames; the data
# was converted to lists only to mirror the 20 newsgroups example.

### Tokenizing text with scikit-learn

In [None]:

# Learn the vocabulary from the training documents only and encode them as
# per-document token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

In [None]:
# Encode the test documents with the vocabulary already fitted on the training
# documents (transform only, no refitting).
X_test_counts = count_vect.transform(X_test)
X_test_counts.shape

In [None]:
# Inspect the container type of the raw test documents.
type(X_test)

In [None]:
# Look up the token 'algorithm' in the fitted vocabulary; .get() yields None
# when the token is absent.
count_vect.vocabulary_.get('algorithm')

In [None]:
# Inspect the type returned by CountVectorizer.fit_transform.
type(X_train_counts)

### Term frequencies

In [None]:

# Fit the TF-IDF weighting on the training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


In [None]:
# Apply the TF-IDF weighting fitted on the training counts to the test counts
# (transform only, no refitting).
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
X_test_tfidf.shape

In [None]:
# Shape of the TF-IDF training matrix that the classifier will be fitted on.
X_train_tfidf.shape

In [None]:
# y_train is a plain Python list (lists have no .shape attribute), so report
# its length instead.
len(y_train)

### Training a classifier

In [None]:
# Fit a random forest on the TF-IDF features of the training documents.
# RandomForestClassifier comes from the notebook's import cell; random_state
# pins the forest's random sampling so a re-run reproduces the same model.
clf = RandomForestClassifier(random_state=0).fit(X_train_tfidf, y_train)


In [None]:
# Look at one raw test document (index 5).
X_test[5]

In [None]:
# Run one held-out dream through the already-fitted vectorizer and TF-IDF
# transformer (transform only, no refitting), then classify it.
sample_dream = X_test[5]
# The single document is wrapped in a list before being passed to transform().
X_new_counts = count_vect.transform([sample_dream])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [None]:
# Type of the one-element list that wraps the sample document.
type([sample_dream])

In [None]:
# Show the prediction next to the annotated code of test item 5.
predicted, y_test[5]

#### Building a pipeline

# TODO:

* restart your kernel
* load your data etc
* instead of the above steps for vectorization, TF-IDF and training, use a pipeline to do the same, then delete the above steps
* evaluate your performance on your test set
* is the performance good? probably not; use a confusion matrix for your classifier as well
* run grid search to improve your model


In [18]:

# End-to-end text classifier: token counts -> TF-IDF weighting -> random forest.
text_clf = Pipeline([
    # Unigrams and bigrams, English stop words removed, accents stripped.
    ('vect', CountVectorizer(lowercase=True, stop_words="english", strip_accents='unicode', ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    # random_state pins the forest's random sampling so the fitted model, and
    # the accuracy computed from it, are reproducible across runs.
    ('clf', RandomForestClassifier(n_jobs=-1, random_state=0)),
])

In [19]:
# Fit every pipeline step on the raw training documents and their labels.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(ngram_range=(1, 2), stop_words='english',
                                 strip_accents='unicode')),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])

### Evaluating performance of test set

In [20]:
# Accuracy: the fraction of test dreams whose predicted code equals the
# annotated one.
predicted = text_clf.predict(X_test)
(predicted == np.asarray(y_test)).mean()

0.4934537246049661

### Parameter tuning with grid search

In [None]:
# Hyper-parameter grid for the pipeline's random forest; the `clf__` prefix
# routes each setting to the pipeline step named 'clf'.
# GridSearchCV itself is imported in the notebook's import cell.
param_grid = {
    'clf__n_estimators': [50, 100, 200, 500],
    # NOTE(review): with a unigram+bigram vocabulary these values are presumably
    # far below the forest's default max_features; consider adding the default
    # to the grid so the search can at least match the baseline. TODO confirm.
    'clf__max_features': [2, 4, 6, 8],
}

In [None]:
# 5-fold cross-validated search over param_grid; n_jobs=-1 uses all available cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
)

In [None]:
# Run the search on this notebook's own training split of dream texts and
# emotion codes. Slice both lists identically (e.g. [:400]) for a quicker,
# rougher search.
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
# Classify a sample sentence with the best estimator found by the search and
# map the predicted emotion code to its readable name via `categories`.
categories[gs_clf.predict(['God is love'])[0]]

In [None]:
# Report the best mean cross-validated score and the settings that achieved it.
best_cv_score = gs_clf.best_score_
print("Best parameter (CV score=%0.3f):" % best_cv_score)
print(gs_clf.best_params_)