In [1]:
#Imports
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Preparing Data

In [2]:
# customized function to sepearate label and text

def read_data(file):
    data = []
    
    with open(file,'r') as f:
        for line in f:
            #print(line)
            line = line.strip()
            print("Line: ",line)
            
            print("Label from [ ]: ",line[1:line.find("]")])
            print("Label stripped for starting and trailing spaces: ",line[1:line.find("]")].strip())
            print("Label split by spaces: ",line[1:line.find("]")].split())
            print("Label joined with a space to convert to string: ",' '.join(line[1:line.find("]")].split()))
            
            label = ' '.join(line[1:line.find("]")].strip().split())
            print("Label Data: ",label)
            print("Label datatype: ",type(label))
            text = line[line.find(']')+1:].strip()
            print("Text Data: ",text)
            data.append([label,text])
    return data

In [3]:
file ='dummy_data.txt'       #'text_emot.txt'
data = read_data(file)
print("Number of instances: {}".format(len(data)))

Line:  [ 1.  0.  0.  0.  0.  0.  0.] During the period of falling in love, each time that we met and especially when we had not met for a long time.
Label from [ ]:   1.  0.  0.  0.  0.  0.  0.
Label stripped for starting and trailing spaces:  1.  0.  0.  0.  0.  0.  0.
Label split by spaces:  ['1.', '0.', '0.', '0.', '0.', '0.', '0.']
Label joined with a space to convert to string:  1. 0. 0. 0. 0. 0. 0.
Label Data:  1. 0. 0. 0. 0. 0. 0.
Label datatype:  <class 'str'>
Text Data:  During the period of falling in love, each time that we met and especially when we had not met for a long time.
Line:  [ 0.  1.  0.  0.  0.  0.  0.] When I was involved in a traffic accident.
Label from [ ]:   0.  1.  0.  0.  0.  0.  0.
Label stripped for starting and trailing spaces:  0.  1.  0.  0.  0.  0.  0.
Label split by spaces:  ['0.', '1.', '0.', '0.', '0.', '0.', '0.']
Label joined with a space to convert to string:  0. 1. 0. 0. 0. 0. 0.
Label Data:  0. 1. 0. 0. 0. 0. 0.
Label datatype:  <class 'str'>

# Tokenization and Generating Features

N-grams are continuous sequences of words or symbols, or tokens in a document. In technical terms, they can be defined as the neighboring sequences of items in a document

So, [^a-z0-9#] will match any single character that is not a lowercase letter, digit, or hash symbol.

Special characters are replaced with space and only lowercase a-z, numbers 0-9, hash symbol retained

<b>For example:</b>

Before:
during the period of falling in love, each time that we met and especially when we had not met for a long time.

After, re.sub[^a-z0-9#] = comma and full stop at the end is removed and space added

during the period of falling in love  each time that we met and especially when we had not met for a long time


text_punc = re.sub('[a-z0-9]',' ',text)

In the above statement, where letters are lowercase/numbers are replaced with space and special characters such as ',' , '.' , '/' etc are retained/returned to text_punc

<b>nrange</b>

This variable which is tuple will suggest how many number of words to be grouped for creating features starting from 1 word to infinity

For example:

nrange=(1,1) # Means feature with one word to be created

Text_Features = ['when', 'i', 'was', 'involved', 'in', 'a', 'traffic', 'accident']

nrange = (1,2) # Here 1 word feature and 2 words features are created

Text Features:  ['when', 'i', 'was', 'involved', 'in', 'a', 'traffic', 'accident', 'when i', 'i was', 'was involved', 'involved in', 'in a', 'a traffic', 'traffic accident']


In [4]:
#Customized function for tokenization and feature generation
def ngram(token,n):
    output = []
    print("Token:",token)
    for i in range(n-1,len(token)):
        print(i)
        ngram = ' '.join(token[i-n+1:i+1])
        print("Ngram Words: ",ngram)
        output.append(ngram)
    return output

def create_feature(text, nrange=(1,1)):
    text_features = []
    text = text.lower()
    print("Text after lowercase: ",text)
    
    text_alphanum = re.sub('[^a-z0-9#]',' ',text)
    print("Text after retaining alphanumeric: ",text_alphanum)
    
    print(nrange[0],nrange[1])
    for n in range(nrange[0],nrange[1]+1):
        print(n)
        
        text_features += ngram(text_alphanum.split(),n)
        print("Text Features: ",text_features)
        
    text_punc = re.sub('[a-z0-9]',' ',text)
    print("Punctuation: ",text_punc)
    
    
    text_features += ngram(text_punc.split(),1)
    print("Text Features with punctuation: ",text_features)
    
    return Counter(text_features)       

In [5]:
type(data)

list

In [6]:
'''for _,text in data:
    create_feature(text)'''

'for _,text in data:\n    create_feature(text)'

# Store and convert labels to Emotions

In [7]:
# Customize function to convert labels to emotions
def convert_label(item,name):
    print("Item: ",item)
    print("Emotions: ",name)
    
    items = list(map(float, item.split()))
    print("Labels converted to float list: ",items)
    
    label = ""
    
    for idx in range(len(items)):
        print(idx)
        
        if items[idx] == 1:
            label += name[idx] +" "
    
    print("Label with indicies: ",label)
    return label.strip()            

In [8]:
emotions = ["joy","fear","anger","sadness", "disgust", "shame", "guilt"]

X_all = []
y_all = []

for label,text in data:
    #label = list(map(int,label.split()))
    #print(type(label))
    y_all.append(convert_label(label,emotions))
    X_all.append(create_feature(text,nrange=(1,2)))

Item:  1. 0. 0. 0. 0. 0. 0.
Emotions:  ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']
Labels converted to float list:  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0
1
2
3
4
5
6
Label with indicies:  joy 
Text after lowercase:  during the period of falling in love, each time that we met and especially when we had not met for a long time.
Text after retaining alphanumeric:  during the period of falling in love  each time that we met and especially when we had not met for a long time 
1 2
1
Token: ['during', 'the', 'period', 'of', 'falling', 'in', 'love', 'each', 'time', 'that', 'we', 'met', 'and', 'especially', 'when', 'we', 'had', 'not', 'met', 'for', 'a', 'long', 'time']
0
Ngram Words:  during
1
Ngram Words:  the
2
Ngram Words:  period
3
Ngram Words:  of
4
Ngram Words:  falling
5
Ngram Words:  in
6
Ngram Words:  love
7
Ngram Words:  each
8
Ngram Words:  time
9
Ngram Words:  that
10
Ngram Words:  we
11
Ngram Words:  met
12
Ngram Words:  and
13
Ngram Words:  especially
14
Ngram

In [9]:
X_all

[Counter({'during': 1,
          'the': 1,
          'period': 1,
          'of': 1,
          'falling': 1,
          'in': 1,
          'love': 1,
          'each': 1,
          'time': 2,
          'that': 1,
          'we': 2,
          'met': 2,
          'and': 1,
          'especially': 1,
          'when': 1,
          'had': 1,
          'not': 1,
          'for': 1,
          'a': 1,
          'long': 1,
          'during the': 1,
          'the period': 1,
          'period of': 1,
          'of falling': 1,
          'falling in': 1,
          'in love': 1,
          'love each': 1,
          'each time': 1,
          'time that': 1,
          'that we': 1,
          'we met': 1,
          'met and': 1,
          'and especially': 1,
          'especially when': 1,
          'when we': 1,
          'we had': 1,
          'had not': 1,
          'not met': 1,
          'met for': 1,
          'for a': 1,
          'a long': 1,
          'long time': 1,
          ',': 1,
    

In [10]:
y_all

['joy', 'fear', 'anger', 'sadness', 'disgust']

# Splitting Data into Training and Test data


The class DictVectorizer can be used to convert feature arrays represented as lists of standard Python dict objects to the NumPy/SciPy representation used by scikit-learn estimators.

While not particularly fast to process, Python’s dict has the advantages of being convenient to use, being sparse (absent features need not be stored) and storing feature names in addition to values.

DictVectorizer implements what is called one-of-K or “one-hot” coding for categorical (aka nominal, discrete) features. Categorical features are “attribute-value” pairs where the value is restricted to a list of discrete of possibilities without ordering (e.g. topic identifiers, types of objects, tags, names…).

In the following, “city” is a categorical attribute while “temperature” is a traditional numerical feature:

measurements = [

{'city': 'Dubai', 'temperature': 33.},

{'city': 'London', 'temperature': 12.},

{'city': 'San Francisco', 'temperature': 18.},

]

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()

vec.fit_transform(measurements).toarray()

array([[ 1., 0., 0., 33.],

[ 0., 1., 0., 12.],

[ 0., 0., 1., 18.]])

vec.get_feature_names()

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

Three first places are cities: 100 Dubai, 010 London, 001 Frisco

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X_all,y_all,test_size=0.1,random_state=123)

In [12]:
X_train

[Counter({'when': 1,
          'i': 1,
          'lost': 1,
          'the': 2,
          'person': 1,
          'who': 1,
          'meant': 1,
          'most': 1,
          'to': 1,
          'me': 1,
          'when i': 1,
          'i lost': 1,
          'lost the': 1,
          'the person': 1,
          'person who': 1,
          'who meant': 1,
          'meant the': 1,
          'the most': 1,
          'most to': 1,
          'to me': 1,
          '.': 1}),
 Counter({'the': 7,
          'time': 1,
          'i': 1,
          'knocked': 1,
          'a': 1,
          'deer': 1,
          'down': 2,
          'sight': 1,
          'of': 2,
          'animal': 3,
          's': 1,
          'injuries': 1,
          'and': 2,
          'helplessness': 1,
          'realization': 1,
          'that': 2,
          'was': 1,
          'so': 1,
          'badly': 1,
          'hurt': 1,
          'it': 1,
          'had': 1,
          'to': 1,
          'be': 1,
          'put': 1,
 

In [13]:
X_test

[Counter({'when': 1,
          'i': 1,
          'was': 1,
          'involved': 1,
          'in': 1,
          'a': 1,
          'traffic': 1,
          'accident': 1,
          'when i': 1,
          'i was': 1,
          'was involved': 1,
          'involved in': 1,
          'in a': 1,
          'a traffic': 1,
          'traffic accident': 1,
          '.': 1})]

In [14]:
#Customized function to run multiple models and note accuracy
def train_test(clf,X_train,X_test,y_train,y_test):
    clf.fit(X_train,y_train)
    train_acc = accuracy_score(y_train,clf.predict(X_train))
    test_acc = accuracy_score(y_test,clf.predict(X_test))
    return train_acc,test_acc


from sklearn.feature_extraction import DictVectorizer
vect = DictVectorizer(sparse=True)
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [15]:
X_train.toarray()

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0.,
        0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0.],
       [1., 1., 1., 2., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 2.,
        0., 1., 0., 1., 3., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0.,
        0., 1., 1., 1., 0., 0., 2., 1., 1., 0., 0., 0., 0., 0.,

In [16]:
X_test.toarray()

array([[0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.]])

# Model Training and Testing

In [17]:
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()

clifs = [svc,lsvc,rforest,dtree]

#train and test them
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))

for clf in clifs:
    clf_name = clf.__class__.__name__
    train_acc,test_acc=train_test(clf,X_train,X_test,y_train,y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |
| SVC                       |         1.0000000 |     0.0000000 |
| LinearSVC                 |         1.0000000 |     0.0000000 |
| RandomForestClassifier    |         1.0000000 |     0.0000000 |
| DecisionTreeClassifier    |         1.0000000 |     0.0000000 |


# Frequency of Labels

In [18]:
l = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

l.sort()
label_freq = {}

for label,_ in data:
    label_freq[label] = label_freq.get(label,0) + 1

#Rewritten function without print statements
def convert_label(item,name):        
    items = list(map(float, item.split()))    
    
    label = ""
    
    for idx in range(len(items)):        
        
        if items[idx] == 1:
            label += name[idx] +" "
    
    return label.strip()
    
# print the labels and their counts in sorted order
for l in sorted(label_freq,key=label_freq.get,reverse=True):
    print("{:10}({})  {}".format(convert_label(l, emotions), l, label_freq[l]))

joy       (1. 0. 0. 0. 0. 0. 0.)  1
fear      (0. 1. 0. 0. 0. 0. 0.)  1
anger     (0. 0. 1. 0. 0. 0. 0.)  1
sadness   (0. 0. 0. 1. 0. 0. 0.)  1
disgust   (0. 0. 0. 0. 1. 0. 0.)  1


# Detecting Emotions

In [19]:
#Repeated function withput print statements
def ngram(token,n):
    output = []    
    for i in range(n-1,len(token)):        
        ngram = ' '.join(token[i-n+1:i+1])        
        output.append(ngram)
    return output

def create_feature(text, nrange=(1,1)):
    text_features = []
    text = text.lower()   
    
    text_alphanum = re.sub('[^a-z0-9#]',' ',text)    
    
    for n in range(nrange[0],nrange[1]+1):                
        text_features += ngram(text_alphanum.split(),n)        
        
    text_punc = re.sub('[a-z0-9]',' ',text)    
    
    text_features += ngram(text_punc.split(),1)    
    
    return Counter(text_features) 

# Giving wrong prediction


In [20]:
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()

clifs = [svc,lsvc,rforest,dtree]

emoji_dict = {"joy":"😂", "fear":"😱", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😳"}

t1 = "This looks so impressive"
t2 = "I have a fear of dogs"
t3 = "My dog died yesterday"
t4 = "I don't love you anymore..!"

texts = [t1, t2, t3, t4]

In [21]:
for text in texts: 
    features = create_feature(text, nrange=(1, 4))
    features = vect.transform(features)
    print(clf)
    prediction = clf.predict(features)[0]
    print( text,emoji_dict[prediction])

DecisionTreeClassifier()
This looks so impressive 😂
DecisionTreeClassifier()
I have a fear of dogs 😂
DecisionTreeClassifier()
My dog died yesterday 😂
DecisionTreeClassifier()
I don't love you anymore..! 😂
