-
Notifications
You must be signed in to change notification settings - Fork 3
/
Test.php
533 lines (420 loc) · 15.6 KB
/
Test.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
<?php
// Tokenize(): splits a text string into lexemes (words, delimiters, compound words and contractions)
/**
 * Split a string into lexemes and recombine compound words and contractions.
 *
 * The text is first cut at every delimiter character; each delimiter is kept
 * as its own lexeme. Two merge passes then run over the result:
 *   1. a compound-word symbol is glued to the non-delimiter lexeme before
 *      and/or after it ("long", "-", "term" becomes "long-term");
 *   2. a contraction symbol that has a non-delimiter lexeme on BOTH sides is
 *      glued to both of them ("Let", "'", "s" becomes "Let's").
 *
 * The text is split byte-wise with str_split(), so a delimiter or symbol
 * longer than one byte will never match.
 *
 * @param string $text                  Text to tokenize.
 * @param array  $delimiters            Characters that end a lexeme.
 * @param array  $compound_word_symbols Symbols that join compound words (may be empty).
 * @param array  $contraction_symbols   Symbols that join contractions (may be empty).
 * @return array Zero-indexed list of lexeme strings, delimiters included.
 */
function Tokenize($text, $delimiters, $compound_word_symbols, $contraction_symbols){
    // Compare symbols as strings so the strict matching below still accepts
    // lists that were written with non-string scalars.
    $delimiters = array_map('strval', (array) $delimiters);
    $compound_word_symbols = array_map('strval', (array) $compound_word_symbols);
    $contraction_symbols = array_map('strval', (array) $contraction_symbols);

    // Slots emptied by a merge are marked NULL. A plain array_filter() would
    // also throw away the lexeme "0", so only NULL and '' are discarded.
    $is_live = function ($lexeme) {
        return $lexeme !== null && $lexeme !== '';
    };

    // Pass 1: cut the text at every delimiter, keeping the delimiter itself.
    $lexemes = array();
    $pending = ''; // characters of the lexeme currently being built
    foreach (str_split($text, 1) as $char) {
        if (!in_array($char, $delimiters, true)) {
            $pending .= $char;
            continue;
        }
        if ($pending !== '') {
            $lexemes[] = $pending;
            $pending = '';
        }
        $lexemes[] = $char;
    }
    if ($pending !== '') {
        // Text did not end on a delimiter: flush the residual lexeme.
        $lexemes[] = $pending;
    }

    // Pass 2: recombine compound words.
    if (!empty($compound_word_symbols)) {
        $last = count($lexemes) - 1;
        for ($i = 0; $i <= $last; $i++) {
            $current = $lexemes[$i];
            if (!in_array($current, $compound_word_symbols, true)) {
                continue;
            }
            // Absorb the word in front of the symbol, if there is one.
            if ($i > 0 && !in_array($lexemes[$i - 1], $delimiters, true)) {
                $current = $lexemes[$i - 1] . $current;
                $lexemes[$i - 1] = null;
            }
            // Push the result into the following word, if there is one.
            // ($i < $last keeps the look-ahead inside the array.)
            if ($i < $last && !in_array($lexemes[$i + 1], $delimiters, true)) {
                $lexemes[$i + 1] = $current . $lexemes[$i + 1];
                $current = null;
            }
            $lexemes[$i] = $current;
        }
    }
    // Drop the emptied slots and re-index.
    $lexemes = array_values(array_filter($lexemes, $is_live));

    // Pass 3: recombine contractions. The symbol needs a neighbour on both
    // sides, so the first and last positions are never candidates.
    if (!empty($contraction_symbols)) {
        $last = count($lexemes) - 1;
        for ($i = 1; $i < $last; $i++) {
            if (!in_array($lexemes[$i], $contraction_symbols, true)) {
                continue;
            }
            $before = $lexemes[$i - 1];
            $after = $lexemes[$i + 1];
            if (in_array($before, $delimiters, true) || in_array($after, $delimiters, true)) {
                continue;
            }
            // The merged token lives in the "after" slot so that a later
            // symbol can still chain onto it.
            $lexemes[$i + 1] = $before . $lexemes[$i] . $after;
            $lexemes[$i - 1] = null;
            $lexemes[$i] = null;
        }
    }

    return array_values(array_filter($lexemes, $is_live));
} // Tokenize()
// Remove unwanted delimiters or symbols from the lexemes array
/**
 * Return $lexemes without the entries listed in $remove_values.
 *
 * Entries are compared by their exact string form, so asking to remove '1'
 * does not also remove '01' or '1.0'. Entries that are NULL, FALSE or ''
 * are dropped as well; '0' and 0 are kept.
 *
 * @param array $lexemes       Lexemes, e.g. as returned by Tokenize().
 * @param array $remove_values Values to strip out.
 * @return array Zero-indexed list of the surviving lexemes, in original order.
 */
function Remove($lexemes, $remove_values){
    // Index the unwanted values once for constant-time lookups.
    $blocked = array();
    foreach ($remove_values as $value) {
        $blocked[(string) $value] = true;
    }

    $kept = array();
    foreach ($lexemes as $lexeme) {
        $text = (string) $lexeme;
        // '' covers NULL, FALSE and the empty string without calling
        // strlen(NULL), which is deprecated since PHP 8.1.
        if ($text === '' || isset($blocked[$text])) {
            continue;
        }
        $kept[] = $lexeme;
    }
    return $kept;
}
// This takes an array of lexemes produced by the Tokenize() function
// and returns an associative array containing tri-grams, bi-grams and skip-grams
/**
 * Build tri-grams, skip-grams and bi-grams from a list of lexemes.
 *
 * A gram is the plain concatenation of its lexemes (no separator); a
 * skip-gram is the first and third lexeme of a tri-gram. A set that has no
 * entries (too few lexemes) is absent from the result.
 *
 * @param array $lexemes Zero-indexed list of lexeme strings.
 * @param bool  $hash    When truthy (default) each gram is returned as its MD5 hex digest.
 * @return array Map with the keys 'trigrams', 'skipgrams' and 'bigrams'.
 */
function ExtractGrams($lexemes, $hash = true){
    // One place decides whether a gram is emitted raw or hashed.
    $encode = function ($joined) use ($hash) {
        return $hash ? hash('md5', $joined) : $joined;
    };

    $result = array();
    $total = count($lexemes);

    // Windows of three: $end is the index of the window's last lexeme.
    for ($end = 2; $end < $total; $end++) {
        $first = $lexemes[$end - 2];
        $middle = $lexemes[$end - 1];
        $third = $lexemes[$end];
        $result['trigrams'][] = $encode($first . $middle . $third);
        $result['skipgrams'][] = $encode($first . $third);
    }

    // Windows of two.
    for ($end = 1; $end < $total; $end++) {
        $result['bigrams'][] = $encode($lexemes[$end - 1] . $lexemes[$end]);
    }

    return $result;
}
// MySQL Server Credentials
// NOTE(review): credentials are hard-coded (root / 'password'); move them to
// configuration before running this anywhere but a local sandbox.
$server = 'localhost';
$username = 'root';
$password = 'password';
$db = 'PartsOfSpeechTagger';
// Create connection.
// Since PHP 8.1 mysqli reports errors by throwing mysqli_sql_exception by
// default, so a failed connection never reaches the connect_error check
// below; catch it to keep the same one-line failure message.
try {
    $conn = new mysqli($server, $username, $password, $db);
} catch (mysqli_sql_exception $e) {
    die("MYSQL DB Connection failed: " . $e->getMessage());
}
// Check connection (covers installations where mysqli does not throw)
if ($conn->connect_error) {
    die("MYSQL DB Connection failed: " . $conn->connect_error);
}
// Characters that mark a lexeme boundary: punctuation first, whitespace last.
$delimiters = array(
    '~', '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+', '`', '-', '=',
    '{', '}', '[', ']', '\\', '|', ':', ';', '"', '\'', '<', '>', ',', '.', '?', '/',
    ' ', "\t", "\n",
);

// Symbols that glue the parts of a compound word together.
$compound_word_symbols = array(
    '-',
    '_',
);

// Symbols that glue the parts of a contraction together.
// Alternative set that also treats '.' as a contraction symbol:
//$contraction_symbols = array("'", '.', '@');
$contraction_symbols = array(
    "'",
    '@',
);

// The sentence(s) to tag.
$text = 'The quick brown fox jumps over the lazy dog. A long-term contract with "zero-liability" protection! Let\'s think it over.';

// Step 1: break $text into lexemes.
$lexemes = Tokenize($text, $delimiters, $compound_word_symbols, $contraction_symbols);

// Step 2: discard lexemes we have no use for. Spaces are dropped because the
// Brown Corpus doesn't use them and nothing below needs them.
$lexemes = Remove($lexemes, array(' '/*, Add other values to remove here*/));

// Step 3: derive the bi-grams, tri-grams and skip-grams from what is left.
$grams = ExtractGrams($lexemes);
// Look up every gram by its hash; matching on one hash column is simpler and
// faster than comparing the individual word fields.
// Each gram string in $grams is replaced by
//   array('hash' => <gram>, 'sql' => <query run, or NULL>, 'data' => <matching rows>).
foreach ($grams as $skey => $gramset) {
    foreach ($gramset as $gkey => $hash) {
        // Hashes are hex digests, but ExtractGrams() can also emit raw text
        // (when called with $hash = false), so escape before interpolating.
        $safe_hash = $conn->real_escape_string($hash);

        if ($skey === 'trigrams') {
            $sql = "SELECT * FROM `Trigrams` WHERE `Hash` = '$safe_hash' ORDER BY `Count` DESC";
        }
        elseif ($skey === 'bigrams') {
            // A bi-gram can match either the first or the last pair of a tri-gram.
            $sql = "SELECT * FROM `Trigrams` WHERE `Hash_AB` = '$safe_hash' OR `Hash_BC` = '$safe_hash' ORDER BY `Count` DESC";
        }
        elseif ($skey === 'skipgrams') {
            $sql = "SELECT * FROM `Trigrams` WHERE `Hash_AC` = '$safe_hash' ORDER BY `Count` DESC";
        }
        else {
            // Unknown gram set: nothing to query, keep the entry well-formed.
            $sql = null;
        }

        $rows = array();
        if ($sql !== null) {
            $result = $conn->query($sql);
            if ($result === false) {
                die('MYSQL query failed: ' . $conn->error);
            }
            // Collect the data for this gram result
            while ($row = $result->fetch_assoc()) {
                $rows[] = array(
                    'Hash' => $row["Hash"],
                    'Count' => $row["Count"],
                    'Word_A' => $row["Word_A"],
                    'Word_B' => $row["Word_B"],
                    'Word_C' => $row["Word_C"],
                    'Tag_A' => $row["Tag_A"],
                    'Tag_B' => $row["Tag_B"],
                    'Tag_C' => $row["Tag_C"],
                );
            }
            $result->free();
        }

        // Written back by key, so no by-reference loop variables are left bound.
        $grams[$skey][$gkey] = array('hash' => $hash, 'sql' => $sql, 'data' => $rows);
    }
}
// Tally, for every lexeme of the sentence, how often each tag was seen for it
// in the gram data, then turn the tallies into "count : percent%" strings.
// Result: $unique_lexemes maps word => array(tag => "count : percent%"),
// highest count first. Words that appear in no gram row have no entry.

// Earlier loops may have left these bound by reference; iterating by value
// over a still-bound variable would overwrite elements of $grams.
unset($gramset, $gram);

// The distinct lexemes, as array keys for direct lookup.
$lexeme_set = array_count_values($lexemes);

// Kept separate from the lexeme set so that a numeric lexeme such as "5"
// cannot collide with a list index.
$unique_lexemes = array();
foreach ($grams as $gramset) {
    foreach ($gramset as $gram) {
        foreach ($gram['data'] as $data) {
            // Every row carries three word/tag pairs: A, B and C.
            foreach (array('A', 'B', 'C') as $slot) {
                $word = (string) $data['Word_' . $slot];
                // Only count words that actually occur in the sentence.
                if (!isset($lexeme_set[$word])) {
                    continue;
                }
                $word_tag = (string) $data['Tag_' . $slot];
                if (!isset($unique_lexemes[$word][$word_tag])) {
                    $unique_lexemes[$word][$word_tag] = 0;
                }
                $unique_lexemes[$word][$word_tag]++;
            }
        }
    }
}

// Sort each word's tags by frequency and compute their share.
foreach ($unique_lexemes as $word => $tag_counts) {
    arsort($tag_counts);
    $sum = array_sum($tag_counts);
    foreach ($tag_counts as $word_tag => $tag_count) {
        $tag_counts[$word_tag] = $tag_count . ' : ' . ($tag_count / $sum * 100) . '%';
    }
    $unique_lexemes[$word] = $tag_counts;
}
// Replace every lexeme string in $lexemes by
//   array('lexeme' => <text>, 'tags' => array(tag => "count : percent%")).
// Tags come from, in order of preference:
//   1. the n-gram tallies in $unique_lexemes,
//   2. the `Words` table (uni-gram),
//   3. the `Tags` table (the lexeme is itself a tag, e.g. punctuation),
//   4. the fallback tag 'unk'.
$in_citation = false; // TRUE while inside an opened quotation
$quotes = array('"', "''", "``");
foreach ($lexemes as $key => $lexeme) {
    // 1. We already have tags for this word from the gram lookups.
    if (array_key_exists($lexeme, $unique_lexemes)) {
        $lexemes[$key] = array('lexeme' => $lexeme, 'tags' => $unique_lexemes[$lexeme]);
        continue;
    }

    // 2. No Bi-gram, Skip-gram or Tri-gram: try the Uni-gram.
    // The lexeme may contain quote characters, so it must be escaped.
    $word = mysqli_real_escape_string($conn, $lexeme);
    $sql = "SELECT * FROM `Words` WHERE `Word` = '$word'";
    $result = $conn->query($sql);
    if ($result === false) {
        die('MYSQL query failed: ' . $conn->error);
    }
    if ($result->num_rows > 0) { // We know this Uni-gram
        while ($row = mysqli_fetch_assoc($result)) {
            // Uni-gram tags are stored as a JSON object of tag => count.
            $tags = json_decode($row["Tags"], true);
            if (is_array($tags)) {
                // Sort the tags and compute %
                arsort($tags);
                $sum = array_sum($tags);
                foreach ($tags as $tag_name => $tag_count) {
                    $tags[$tag_name] = $tag_count . ' : ' . ($tag_count / $sum * 100) . '%';
                }
            } else {
                // Unreadable tag data: treat the word as unknown.
                $tags = array('unk' => '1 : 100%');
            }
            $lexemes[$key] = array('lexeme' => $lexeme, 'tags' => $tags);
        }
        continue;
    }

    // 3. We don't know this word; it may be a tag such as punctuation.
    // A quote character is first converted to the opening (``) or closing
    // ('') quote tag, alternating with each quote seen.
    if (in_array($lexeme, $quotes, true)) {
        $lexeme = $in_citation ? "''" : "``";
        $in_citation = !$in_citation;
    }

    // Last attempt: if the lexeme can't be located here either it is
    // reported as an unknown lexeme/tag.
    $l = mysqli_real_escape_string($conn, $lexeme);
    $sql = "SELECT * FROM `Tags` WHERE `Tag` = '$l'";
    $result = $conn->query($sql);
    if ($result === false) {
        die('MYSQL query failed: ' . $conn->error);
    }
    if ($result->num_rows > 0) { // The lexeme is a known tag
        while ($row = mysqli_fetch_assoc($result)) {
            $lexemes[$key] = array('lexeme' => $lexeme, 'tags' => array($row["Tag"] => $row["Count"] . ' : 100%'));
        }
    } else { // 4. We don't know this Tag
        $lexemes[$key] = array('lexeme' => $lexeme, 'tags' => array('unk' => '1 : 100%'));
    }
}
// All lookups are done; release the database connection.
$conn->close();

// --- Original sentence ---
echo 'Sentence: ', $text, PHP_EOL, PHP_EOL;

// --- Tagged sentence: every lexeme followed by its first-listed tag ---
$unique_tags = array(); // tag => number of lexemes carrying it
echo 'Tagged Sentence: ';
foreach ($lexemes as $entry) {
    $top_tag = key($entry['tags']);
    echo $entry['lexeme'], '/', $top_tag, ' ';
    $bucket = (string) $top_tag;
    $unique_tags[$bucket] = isset($unique_tags[$bucket]) ? $unique_tags[$bucket] + 1 : 1;
}
echo PHP_EOL, PHP_EOL;

// --- Basic tags report: how the tags are distributed over the sentence ---
echo 'Tags: ', PHP_EOL;
arsort($unique_tags);
$sum = array_sum($unique_tags);
echo count($unique_tags), " unique tags, $sum total.", PHP_EOL;
foreach ($unique_tags as $tag => $count) {
    $share = number_format($count / $sum * 100, 2);
    echo "$tag($count) - ", $share, '% of the sentence.', PHP_EOL;
}
echo PHP_EOL, PHP_EOL;

// --- Detailed report: every candidate tag of every lexeme ---
echo 'Detailed Report: ', PHP_EOL;
foreach ($lexemes as $entry) {
    echo '[', $entry['lexeme'], ']', PHP_EOL;
    $lines = array();
    foreach ($entry['tags'] as $tag => $value) {
        $lines[] = "$tag($value)";
    }
    echo trim(implode(PHP_EOL, $lines)), ' ', PHP_EOL, PHP_EOL;
}
/*
* Results
*
*
Sentence: The quick brown fox jumps over the lazy dog. A long-term contract with "zero-liability" protection! Let's think it over.
Tagged Sentence: The/at quick/jj brown/jj fox/np jumps/nns over/in the/at lazy/jj dog/nn ./. A/at long-term/nn contract/vb with/in ``/`` zero-liability/unk ''/'' protection/nn-hl !/. Let's/vb+ppo think/vb it/ppo over/in ./.
Tags:
14 unique tags, 24 total.
at(3) - 12.50% of the sentence.
jj(3) - 12.50% of the sentence.
in(3) - 12.50% of the sentence.
.(3) - 12.50% of the sentence.
nn(2) - 8.33% of the sentence.
vb(2) - 8.33% of the sentence.
np(1) - 4.17% of the sentence.
nns(1) - 4.17% of the sentence.
``(1) - 4.17% of the sentence.
unk(1) - 4.17% of the sentence.
''(1) - 4.17% of the sentence.
nn-hl(1) - 4.17% of the sentence.
vb+ppo(1) - 4.17% of the sentence.
ppo(1) - 4.17% of the sentence.
Detailed Report:
[The]
at(3 : 100%)
[quick]
jj(1 : 100%)
[brown]
jj(2 : 100%)
[fox]
np(6 : 66.666666666667%)
nn-tl(3 : 33.333333333333%)
[jumps]
nns(1 : 100%)
[over]
in(522 : 81.5625%)
rp(114 : 17.8125%)
in-hl(4 : 0.625%)
[the]
at(539 : 100%)
[lazy]
jj(27 : 100%)
[dog]
nn(22 : 100%)
[.]
.(1599 : 98.764669549104%)
.-hl(20 : 1.2353304508956%)
[A]
at(1380 : 97.802976612332%)
at-hl(26 : 1.8426647767541%)
nn(2 : 0.1417434443657%)
np-hl(2 : 0.1417434443657%)
at-tl-hl(1 : 0.070871722182849%)
[long-term]
nn(1 : 100%)
[contract]
vb(2 : 50%)
nn(2 : 50%)
[with]
in(6 : 85.714285714286%)
rb(1 : 14.285714285714%)
[``]
``(8837 : 100%)
[zero-liability]
unk(1 : 100%)
['']
''(8789 : 100%)
[protection]
nn-hl(10 : 100%)
[!]
.(6 : 100%)
[Let's]
vb+ppo(9 : 100%)
[think]
vb(31 : 100%)
[it]
ppo(150 : 71.428571428571%)
pps(60 : 28.571428571429%)
[over]
in(522 : 81.5625%)
rp(114 : 17.8125%)
in-hl(4 : 0.625%)
[.]
.(1599 : 98.764669549104%)
.-hl(20 : 1.2353304508956%)
*/