In [1]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import string
from textblob import TextBlob
import gensim
from gensim import corpora
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
from fuzzywuzzy import fuzz
import re
import nltk
from datetime import date
from statistics import mean
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords #To Remove the StopWords like "the","in" ect
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer #lemmatize the word for example "studies" and "studying" will be converted to "study"



In [17]:
# load tensor flow, this takes about 2 minutes
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" # options are: ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)

# define function to get the tensor flow embeddings
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [18]:
### INPUT NEEDED FROM USER

# file with the verbatims
response_file = "cbgwwav2verbatims.csv"
# column name of the respondent ID
id_column = "ï»¿{Case ID}"
# column names from the response file that you want to run the coder on
columns = [
    "O_DESCRIBE"
]

In [19]:
# where the results will be save [YYYMMDD_responsefile_output]
todays_date = date.today().strftime("%Y%m%d")
output_folder = todays_date + "_" + response_file.split(".csv")[0] + "_output/"

In [20]:
for c in columns:
    print("NOW RUNNING: " + c)
    # The code assumes a codeframe file in this directory called "[columnnameofverbtaims]_codeframe.csv"
    codeframe_file = c + "_codeframe.csv"
    column = c
    # output file will be named YYYYMMDD_[columnnameofverbtaims]_coded.csv
    output_file = output_folder + "_" + todays_date + "_" + c + "_coded.csv"
    
    df = pd.read_csv(response_file, encoding = "ISO-8859-1")
    df = df.replace(np.nan, '', regex=True)
    df = df.astype('str')
    responses = df[column].tolist()
    df_response = df[column]
    
    respondent_id = df[id_column]
    
    
    code_frame_df = pd.read_csv(codeframe_file, header = None)
    code_frame_df.index += 1
    code_frame = code_frame_df.iloc[:,0].tolist()
    split_code_frame = [c.split(",") for c in code_frame]
    strip_and_split_code_frame = []
    for code_list in split_code_frame:
        strip = [c.strip() for c in code_list]
        strip.append("".join(code_list))
        strip_and_split_code_frame.append(strip)
    code_frame_embeddings = []
    for cf in strip_and_split_code_frame:
        code_frame_embeddings.append(embed(cf))
    
    dk_na = ["Nothing", "I don't know"]
    exclusive_code_frame_options = embed(dk_na)
    
    
    count = 0
    results = []
    for response in responses:
        count = count + 1
        print(count)
        best_responses = []
        best_responses_scores = []
        
        # want to split up the responses into clauses....to make it easier to split
        # we replace all punctuation that indicates "clause" with a period so we can
        # just split on period later
        response = response.replace(') ', '.')
        response = response.replace(',', '.')
        response = response.replace(';', '.')
        response = response.replace('! ', '.')
        response = response.replace('...', '.')
        response = response.replace('- ', '.')
        response = response.replace('/', '.')
        response = response.replace('&', '.')
        response = response.replace(' and ', '.')
        
        for res in response.split("."):
            res = res.strip()
            response_embeddings = embed([res])
            if (not res) | res.isspace():
                continue
            else:
                max_scores = []
                # loop through the codeframe embeddings
                for i in range(0, len(code_frame)):
                    code_frame_scores = []
                    # get the embedding for the ith codeframe
                    code_frame_embedding = code_frame_embeddings[i]
                    # get the cosine similiary between the response emedding and every codeframe emedding 
                    # cosines is a list of the cosine similatiries where the index corresponds to the
                    # the codeframe indexes
                    # so cosines[0] is cosine similiatry of respnose and code_frame[0]
                    cosines = tf.keras.losses.cosine_similarity(code_frame_embedding, response_embeddings,axis=-1).numpy().tolist()
                    
                    # I'm waiting the cosine score and the fuzzy token sort scores here...honestly
                    # it's a little arbitrary i chose 130 and .70 aka putting more weight on the cosine than the fuzzy sort ratio
                    scores = [(c * -130 +  fuzz.token_sort_ratio(code_frame[i], res) * .70) for c in cosines]
                    
                    # take the max score append to the list
                    max_scores.append(max(scores))
                
                # get the best score's index. -1 could be changed to -2 or whatever 
                # to get the top n scores
                max_best_scores = np.argsort(max_scores)[-1:].tolist()
                
                # loop through top n scores (in the case its just the one score)
                for best_score in max_best_scores:
                    # basically only take the top if it's greater than 75...again
                    # an arbitrary threshold for what's considered "good enough"
                    if max_scores[best_score] > 75:
                        # since code_frame is not zero indexes need to add one
                        code_frame_number = best_score + 1
                        # check to make sure we don't already have that codeframe variable
                        if code_frame_number not in best_responses:
                            best_responses.append(code_frame_number)
                            best_responses_scores.append(max_scores[best_score])
        
        # take top four best responses
        top_best_responses = np.argsort(best_responses_scores)[-4:].tolist()
        response_result = [best_responses[i] for i in top_best_responses]
        
        # basically if the response ins't blank but we couldn't find any matches with the codeframe
        if not response_result and not(not response):
            # well try with the exclusive codeframe options (Which in this case is "none" and "I don't know"
            cosines = tf.keras.losses.cosine_similarity(exclusive_code_frame_options, response_embeddings,axis=-1).numpy().tolist()
            # weighting is a little different here since the chance of it being an exact word match are more likely
            scores = [(c * -122 +  fuzz.token_sort_ratio(dk_na[cosines.index(c)], response)) for c in cosines]
            max_exclusive_score = max(scores)
            #if none of the exclusive options produce a sufficient score, we use "97" which stands for "other"
            if max_exclusive_score < 70:
                response_result = [97]
            else: 
                # otherwise its 98 (none) or 99 (I don't know)
                # none is index at 0 and i don't know is indexed 1 so I just add 98 
                response_result = [scores.index(max_exclusive_score) + 98]
        results.append(response_result)
    
    # create a dataframe of the responses to the results
    d = {'Response':  responses,'Code': results}
    df_results = pd.DataFrame(d)
    output_columns = []
    # create columns for the codes
    for i in range(1, max([len(x) for x in results]) + 1):
        output_columns.append("Code" + str(i))
    new_df = pd.DataFrame(df_results["Code"].to_list(), columns=output_columns)
    new_df['Response'] = df_results.Response
    # add respondent ID 
    new_df.insert(0, 'RESPONDENT_ID', respondent_id)
    # output csv
    new_df.to_csv(output_file)
    print("DONE WITH: " + c + "\n\n")     
        

NOW RUNNING: O_DESCRIBE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271


FileNotFoundError: [Errno 2] No such file or directory: '20210408_cbgwwav2verbatims_output/_20210408_O_DESCRIBE_coded.csv'