In [2]:
import pandas as pd
import numpy
import ollama
import os

from pydub import AudioSegment

In [9]:
# This notebook runs the initial evaluation of the LLMs against the dataset of audio calls.

# Build the list of non-malicious calls together with their metadata.


def _load_wav_calls(directory, dataset):
    """Load every ``.wav`` file found directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan; sub-folders are not visited.
    dataset : str
        Label stored under the "dataset" key of every record.

    Returns
    -------
    list of dict
        One record per file with the keys "filename", "path", "audio"
        (the object returned by ``AudioSegment.from_file``) and "dataset",
        in the order ``os.listdir`` returned the files.
    """
    calls = []
    for filename in os.listdir(directory):
        if filename.endswith(".wav"):
            # '/' is concatenated by hand so the stored path string is the
            # same on every platform.
            path = directory + '/' + filename
            calls.append({
                "filename": filename,
                "path": path,
                "audio": AudioSegment.from_file(path),
                "dataset": dataset
            })
    return calls


# (folder, dataset label) of every non-malicious source, in load order
NON_MALICIOUS_SOURCES = [
    ('CallHomeOpenings', "CallHomeOpenings"),
    ('Business/wav_trim', "Business"),
    ('Synthetic_daily_dialogs/wav', "DailyDialog"),
]

non_malicious_calls = []

for source_dir, source_label in NON_MALICIOUS_SOURCES:
    non_malicious_calls.extend(_load_wav_calls(source_dir, source_label))

len(non_malicious_calls), non_malicious_calls


(1169,
 [{'filename': 'audio_127.wav',
   'path': 'CallHomeOpenings/audio_127.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x2881bec50>,
   'dataset': 'CallHomeOpenings'},
  {'filename': 'audio_70.wav',
   'path': 'CallHomeOpenings/audio_70.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x2883f79d0>,
   'dataset': 'CallHomeOpenings'},
  {'filename': 'audio_133.wav',
   'path': 'CallHomeOpenings/audio_133.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x1068c6b50>,
   'dataset': 'CallHomeOpenings'},
  {'filename': 'audio_64.wav',
   'path': 'CallHomeOpenings/audio_64.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x1068aa650>,
   'dataset': 'CallHomeOpenings'},
  {'filename': 'audio_58.wav',
   'path': 'CallHomeOpenings/audio_58.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x2884106d0>,
   'dataset': 'CallHomeOpenings'},
  {'filename': 'audio_59.wav',
   'path': 'CallHomeOpenings/audio_59.wav',
   'audio': <pydub.audio_segment.AudioSegment at 

In [43]:
# Build the malicious-call list from the robocall metadata table.
malicious_calls = []

df = pd.read_csv("robocall_26kHz_metadata.csv")

# keep only the rows whose "language" column equals "en"
eng_robocalls = df.loc[df["language"] == "en"]

# Same record layout as non_malicious_calls; the "file_name" value is used both
# as the filename and as the path the audio is loaded from.
for wav_path in eng_robocalls["file_name"]:
    malicious_calls.append(dict(
        filename=wav_path,
        path=wav_path,
        audio=AudioSegment.from_file(wav_path),
        dataset="Robocall",
    ))

len(malicious_calls), malicious_calls

(1378,
 [{'filename': 'audio-wav-16khz/1112259_normalized.wav',
   'path': 'audio-wav-16khz/1112259_normalized.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x289c06650>,
   'dataset': 'Robocall'},
  {'filename': 'audio-wav-16khz/58345_normalized.wav',
   'path': 'audio-wav-16khz/58345_normalized.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x28acc0150>,
   'dataset': 'Robocall'},
  {'filename': 'audio-wav-16khz/936704_normalized.wav',
   'path': 'audio-wav-16khz/936704_normalized.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x288192390>,
   'dataset': 'Robocall'},
  {'filename': 'audio-wav-16khz/1100696_normalized.wav',
   'path': 'audio-wav-16khz/1100696_normalized.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x2884f0b50>,
   'dataset': 'Robocall'},
  {'filename': 'audio-wav-16khz/807777_normalized.wav',
   'path': 'audio-wav-16khz/807777_normalized.wav',
   'audio': <pydub.audio_segment.AudioSegment at 0x16d3fc750>,
   'dataset': 'Robocall'},
 

In [44]:


import speech_recognition as sr

import json

# Run the eval on the malicious calls: transcribe each call with the Sphinx
# recognizer, ask every model for a block/allow verdict on the transcript and
# save the raw responses per model.
r = sr.Recognizer()

opts = {
    "temperature": 0.3,
    }

# The results collected so far are written to CHECKPOINT_PATH every
# CHECKPOINT_EVERY answered calls.
CHECKPOINT_PATH = "checkpoint.txt"
CHECKPOINT_EVERY = 5


model_strings = ['llama3.2:1b', 'llama3.2:3b', 'phi3:3.8b', 'gemma:2b']

for model_id in model_strings:
  results = []
  checkpoint = 0  # number of calls this model has answered so far

  for call in malicious_calls:
      message = ""
      # run asr on the wav file
      with sr.AudioFile(call["path"]) as source:
          # ensure source is proper mono channel audio, 16kHz
          audio = r.record(source)
          try:
              message = r.recognize_sphinx(audio)
          except sr.UnknownValueError:
              # The recognizer could not produce a transcript; treat it like
              # an empty transcript instead of aborting the whole run.
              message = ""

      # nothing to classify without a transcript
      if message == "":
          continue

      response = ollama.chat(model=model_id, messages=[
        {
          'role': 'user',
          'content': "Is the message spam or phishing? Ignore things \
          like capitalization and spelling when making your decision, \
          this is an Automatic Speech Recognition \
          transcript obtained from a phone call, so errors like this are not made  \
          by the sender of the message \
          If you believe it spam or phishing, then write (call: block) or (call: allow) if \
          it is not. Message: {}".format(message),
        },
      ], options=opts)

      results.append( {
        "entry": {
          "filename": call["filename"],
          "dataset": call["dataset"],
          "message": message
        },
        "response_{}".format(model_id): response,
      })

      checkpoint += 1

      if checkpoint % CHECKPOINT_EVERY == 0:
          # Rewrite the file from scratch each time so it always contains a
          # single valid JSON document; the `with` block closes the handle.
          with open(CHECKPOINT_PATH, "w") as checkpoint_file:
              checkpoint_file.write(json.dumps(results))

      print(checkpoint)

  # dump results to json file
  with open("malicious_results_{}.json".format(model_id), "w") as f:
      f.write(json.dumps(results))


# `results` holds only the entries of the last model in model_strings here
json.dumps(results)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277




In [54]:
# Tabulate the collected results and save the table as CSV.
df = pd.DataFrame(results)
df.to_csv("pretrained_model_results.csv")

In [61]:
# True where the llama3.2:1b response text contains either marker substring
df["llama3.2:1b response"].apply(
    lambda text: any(marker in text for marker in ("block", "likely spam"))
)

0     False
1     False
2     False
3     False
4     False
      ...  
67    False
68     True
69    False
70     True
71    False
Name: llama3.2:1b response, Length: 72, dtype: bool

In [9]:
# install pydub for streaming audio
!pip install -Uq pydub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [45]:

# Show the text of the response Google Gemini returned for a scam audio file.
# NOTE(review): `gemini_audio_only_response` is not defined in any cell visible
# here — presumably it comes from a cell that was removed or lives elsewhere;
# confirm before relying on Restart & Run All.

gemini_audio_only_response.text


'(call: block)\n'