-
Notifications
You must be signed in to change notification settings - Fork 4
/
build_feature_vector.py
183 lines (152 loc) · 5.87 KB
/
build_feature_vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
import numpy as np
import operator
import pickle
import preprocessing as pre
import re
import string
global char_n_grams_index, word_n_grams_index, hate_words_index
def AddEmoticonFeatures(feature_vector, happy_emoticon, sad_emoticon,
                        disgust_emoticon, anger_emoticon, fear_emoticon,
                        surprise_emoticon):
    """Append emoticon-category count features to feature_vector.

    For every known emoticon (pre.all_emoticons) the first matching
    category in priority order happy > sad > anger is counted.  The
    disgust/fear/surprise branches are currently disabled; those
    parameters are accepted only to keep the call signature stable.

    Returns feature_vector with three counts appended, in order:
    [happy_count, sad_count, anger_count].
    """
    count_happy = count_sad = count_anger = 0
    for emoticon in pre.all_emoticons:
        # elif chain: an emoticon present in several category sets is
        # counted once, for the highest-priority category only.
        if emoticon in happy_emoticon:
            count_happy += 1
        elif emoticon in sad_emoticon:
            count_sad += 1
        elif emoticon in anger_emoticon:
            count_anger += 1
    feature_vector.append(count_happy)
    feature_vector.append(count_sad)
    feature_vector.append(count_anger)
    return feature_vector
def AddCharNGramFeatures(feature_vector, char_n_grams_index, char_n_grams):
    """Extend feature_vector with a binary presence vector over the
    indexed character n-grams: slot i becomes 1 iff some n-gram that
    char_n_grams_index maps to i occurs in char_n_grams.
    """
    presence = [0] * len(char_n_grams_index)
    seen_slots = (char_n_grams_index[gram]
                  for gram in char_n_grams
                  if gram in char_n_grams_index)
    for slot in seen_slots:
        presence[slot] = 1
    feature_vector.extend(presence)
    return feature_vector
def AddWordNGramFeatures(feature_vector, word_n_grams_index, word_n_grams):
    """Extend feature_vector with a binary presence vector over the
    indexed word n-grams: slot i becomes 1 iff some n-gram that
    word_n_grams_index maps to i occurs in word_n_grams.
    """
    flags = [0] * len(word_n_grams_index)
    for gram in word_n_grams:
        position = word_n_grams_index.get(gram)
        if position is not None:
            flags[position] = 1
    feature_vector.extend(flags)
    return feature_vector
def AddHateWordsFeature(feature_vector, hate_words_index, tweet_hate_words):
    """Extend feature_vector with a binary presence vector over the
    hate-word lexicon: slot i becomes 1 iff a word that
    hate_words_index maps to i appears among tweet_hate_words.
    """
    slots = [0] * len(hate_words_index)
    # Intersect once instead of testing membership word-by-word.
    for matched in set(tweet_hate_words) & set(hate_words_index):
        slots[hate_words_index[matched]] = 1
    feature_vector.extend(slots)
    return feature_vector
def AddPunctuationMarksFeature(feature_vector, punctuations_marks_count):
    """Append one presence flag per known punctuation mark (from
    pre.punctuations_marks, in that order), then the number of distinct
    punctuation marks observed in the tweet.
    """
    flags = [1 if mark in punctuations_marks_count else 0
             for mark in pre.punctuations_marks]
    feature_vector.extend(flags)
    feature_vector.append(len(punctuations_marks_count))
    return feature_vector
def AddRepetitiveWordsFeature(feature_vector, repetitive_words):
    """Append a single binary flag: 1 if the tweet contained any word
    with repeated characters, else 0.
    """
    feature_vector.append(1 if repetitive_words else 0)
    return feature_vector
def AddUpperCaseWordsFeature(feature_vector, upper_case_words):
    """Append a single binary flag: 1 if the tweet contained any
    all-uppercase word, else 0.
    """
    flag = 0
    if upper_case_words:
        flag = 1
    feature_vector.append(flag)
    return feature_vector
def AddIntensifersFeature(feature_vector, intensifiers):
    """Append a single binary flag: 1 if the tweet contained any
    intensifier word, else 0.

    (Name keeps the original spelling to preserve the public interface.)
    """
    feature_vector.append(int(len(intensifiers) > 0))
    return feature_vector
def AddNegationsFeature(feature_vector, negations):
    """Append a single binary flag: 1 if the tweet contained any
    negation word, else 0.
    """
    has_negation = bool(negations)
    feature_vector.append(1 if has_negation else 0)
    return feature_vector
def BuildFeatureVectorForTweet(tweet):
    """Build the feature vector for a single tweet.

    Preprocesses the tweet via pre.PreProcessing and, in the current
    configuration, encodes only the character n-gram presence features;
    every other feature family below is disabled (commented out).

    Requires the module-level char_n_grams_index / word_n_grams_index /
    hate_words_index globals to be populated first (GetFeatureVector
    loads them from the pickle file before calling this).
    """
    #print "BuildFeatureVectorForTweet Called"
    global char_n_grams_index, word_n_grams_index, hate_words_index
    # NOTE(review): the 17-element unpack below must match the exact
    # return order of pre.PreProcessing — verify against preprocessing.py.
    happy, sad, anger, fear, surprise, disgust, hashtags, usernames, \
    urls, punctuations_marks_count, repetitive_words, char_n_grams, \
    word_n_grams, upper_case_words, intensifiers, negations, tweet_hate_words = pre.PreProcessing(tweet)
    #print tweet_hate_words
    feature_vector = []
    #print char_n_grams_index
    #print word_n_grams_index
    #feature_vector = AddEmoticonFeatures(feature_vector, happy, sad, disgust, anger, fear, surprise)
    feature_vector = AddCharNGramFeatures(feature_vector, char_n_grams_index, char_n_grams)
    #feature_vector = AddWordNGramFeatures(feature_vector, word_n_grams_index, word_n_grams)
    #feature_vector = AddRepetitiveWordsFeature(feature_vector, repetitive_words)
    #feature_vector = AddPunctuationMarksFeature(feature_vector, punctuations_marks_count)
    #feature_vector = AddHateWordsFeature(feature_vector, hate_words_index, tweet_hate_words)
    #feature_vector = AddUpperCaseWordsFeature(feature_vector, upper_case_words)
    #feature_vector = AddIntensifersFeature(feature_vector, intensifiers)
    #feature_vector = AddNegationsFeature(feature_vector, negations)
    return feature_vector
def GetFeatureVector(tweet):
    """Load the pickled feature indices and build the vector for one tweet.

    'pickle_data.txt' is expected to hold a pickled item count followed
    by that many pickled objects, in order: char_n_grams_index,
    word_n_grams_index, hate_words_index.  These are stored in the
    module-level globals used by BuildFeatureVectorForTweet.

    NOTE: the indices are re-read from disk on every call; bulk callers
    pay that cost per tweet.
    """
    global char_n_grams_index, word_n_grams_index, hate_words_index
    # 'with' closes the handle even if unpickling raises (the original
    # leaked it on error and shadowed the builtin name 'file'); 'range'
    # replaces the Python-2-only 'xrange' so this also runs on Python 3.
    with open('pickle_data.txt', 'rb') as fh:
        count = pickle.load(fh)  # first record: number of objects that follow
        data = [pickle.load(fh) for _ in range(count)]
    char_n_grams_index, word_n_grams_index, hate_words_index = data
    return BuildFeatureVectorForTweet(tweet)
def FeatureVectorDictionary(tweet_mapping):
    """Return {tweet_id: feature_vector} for every entry in tweet_mapping.

    Uses dict.items() instead of the Python-2-only iteritems() so the
    code also runs under Python 3.
    """
    return {key: GetFeatureVector(tweet)
            for key, tweet in tweet_mapping.items()}
def TrainingData(id_tweet_map, id_class_map):
    """Build parallel training lists from the id->tweet and id->class maps.

    Returns (tweet_feature_vector, tweet_class): the i-th feature vector
    and the i-th class label belong to the same tweet id.

    Raises KeyError if an id present in id_tweet_map is missing from
    id_class_map.
    """
    feature_vector_dict = FeatureVectorDictionary(id_tweet_map)
    tweet_feature_vector = []
    tweet_class = []
    # items() works on both Python 2 and 3 (iteritems() is Py2-only);
    # use the iterated value directly instead of a second dict lookup.
    for key, vector in feature_vector_dict.items():
        tweet_feature_vector.append(vector)
        tweet_class.append(id_class_map[key])
    return tweet_feature_vector, tweet_class