# Downloads and Imports

In [2]:
import nltk
import pandas as pd
import urllib.request
import json
from difflib import ndiff
from collections import Counter
from nltk.corpus import wordnet as wn
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Downloading the Birkbeck data and initialising a Pandas DataFrame

In [None]:
def createData(link, filename = "data/data.dat"):

  urllib.request.urlretrieve(link, filename)
  current_key = None
  result_dict = {}
  repeats = []
  print('About the spellcheck dataset:\n')
  for line in open(filename).readlines():

      lines = line.strip().split('\n')

      if lines[0].startswith('$'):
              current_key = lines[0][1:].lower()
              if(current_key not in result_dict.keys()):
                  result_dict[current_key] = []
              else:
                print('The word "' + current_key + '" occurs more than once')
                repeats.append(current_key)
                #result_dict[current_key].append(lines[0].lower())
              #print(line[1:-2])
              #result_dict[current_key] = []
      else:
              result_dict[current_key].append(lines[0].lower())

  df = pd.DataFrame([result_dict.keys(), result_dict.values()]).T
  df.columns = ['words', 'misspellings']

  return df, repeats

In [206]:
from pandarallel import pandarallel
pandarallel.initialize(nb_workers = 300, progress_bar=True)

INFO: Pandarallel will run on 300 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
import utils.setup

# Update the dataframe

In [201]:
def findingClosestWords(start_index = 0, end_index = 10, df =None, output_file_path = 'spellCheck.json', k_vals = [1,5,10], link =  "https://www.dcs.bbk.ac.uk/~roger/missp.dat"):

  repeats = []
  if (df is None):
    df, repeats = createData(link = link)  #can mention filename and link here
  
  df, starters = findPossibleFirstLetters(df)
  dictionary, word_count = buildDictionary()
  describeData(df, repeats, word_count)
  df1 = df[start_index : end_index]
  df1['k_list'] = df1.parallel_apply(lambda x: find_k(x.words, x.misspellings, dictionary, starters), axis = 1)
  results = getResults(df)
  print('\n')
  success_at_k = getAverageSuccessValues(results, df.num_of_words.sum(), k_vals)
  df1.drop(columns = ['num_of_words'], inplace = True)
  df1.to_json('results//birkbeckResults//full//spellCheck.json')
  with open("results//birkbeckResults//full//evalResults.json", "w") as fp:
    json.dump(success_at_k, fp)
  print('-----------------------------------------------------------------------------------------------------')
  return df1

In [207]:
k_values = findingClosestWords(start_index = 1, end_index = 10)

About the spellcheck dataset:

The word "cambridge" occurs more than once
The word "february" occurs more than once
The word "miss" occurs more than once
The word "monday" occurs more than once
The word "sunday" occurs more than once
The word "wednesday" occurs more than once

There are a total of 6130 unique correct words and 6 repeats which totals to 6136 words
There are a total of 36133 misspelt words in the dataset

About the wordnet dictionary:

There are 147306 words in the wordnet dictionary



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

k =  4 in 2501 searches
k =  1 in 19162 searches
k =  2 in 7151 searches
k =  5 in 1376 searches
k =  3 in 4184 searches
k =  7 in 418 searches
k =  6 in 756 searches
k =  10 in 78 searches
k =  8 in 245 searches
k =  9 in 149 searches
k =  11 in 48 searches
k =  12 in 33 searches
k =  13 in 19 searches
k =  16 in 4 searches
k =  15 in 1 searches
k =  14 in 7 searches
k =  17 in 1 searches


Success at 1 is 53.03 %
Success at 5 is 95.13 %
Success at 10 is 99.69 %
-----------------------------------------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['k_list'] = df1.parallel_apply(lambda x: find_k(x.words, x.misspellings, dictionary, starters), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(columns = ['num_of_words'], inplace = True)


# Concatenate result files

In [None]:
import os

In [119]:
df = pd.DataFrame()
for json_file in os.listdir('results//birkbeckResults//parts'):
    file_name = 'results//birkbeckResults//parts' + json_file
    df1 = pd.read_json(file_name)
    df = pd.concat([df, df1])

In [218]:
def concatenateParallelRuns(k_vals = [1, 5, 10], num_of_words = 36133):
  df1 = pd.read_json('/content/bb_0_to_1000.json')
  df2 = pd.read_json('/content/bb_1000_to_2000.json')
  df3 = pd.read_json('/content/bb_2000_to_3000.json')
  df4 = pd.read_json('/content/bb_3000_to_4000.json')
  df5 = pd.read_json('/content/bb_4000_to_5000.json')
  df6 = pd.read_json('/content/bb_5000_to_6130.json')
  df = pd.concat([df1, df2, df3, df4,df5, df6])
  results = getResults(df)
  print('\n')
  success_at_k = getAverageSuccessValues(results, num_of_words, k_vals)
  df.to_json('spellCheck.json')
  with open("evalResults.json", "w") as fp:
    json.dump(success_at_k, fp)
  print('-----------------------------------------------------------------------------------------------------')
  return df

In [219]:
concatenateParallelRuns()

k =  4 in 2501 search(es)
k =  1 in 19162 search(es)
k =  2 in 7151 search(es)
k =  5 in 1376 search(es)
k =  3 in 4184 search(es)
k =  7 in 418 search(es)
k =  6 in 756 search(es)
k =  10 in 78 search(es)
k =  8 in 245 search(es)
k =  9 in 149 search(es)
k =  11 in 48 search(es)
k =  12 in 33 search(es)
k =  13 in 19 search(es)
k =  16 in 4 search(es)
k =  15 in 1 search(es)
k =  14 in 7 search(es)
k =  17 in 1 search(es)


Success at 1 is 53.03 %
Success at 5 is 95.13 %
Success at 10 is 99.69 %
-----------------------------------------------------------------------------------------------------


Unnamed: 0,words,misspellings,k_list
0,albert,[ab],[4]
1,america,"[ameraca, amercia]","[1, 1]"
2,american,[ameracan],[1]
3,april,[apirl],[1]
4,austrian,[austrain],[1]
...,...,...,...
6125,february,"[febuary, feburary, feburay, febuary]","[1, 1, 1, 1]"
6126,miss,"[mis, mess, mis, mrs]","[1, 1, 1, 2]"
6127,monday,"[munday, mond]","[1, 2]"
6128,sunday,"[sanday, sonday, sunbay, sundays, suntday]","[1, 1, 1, 1, 1]"


# Trying pytrec

In [152]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308218 sha256=3ffce941dc602f962577b5a2528a91ef7ec93bb32ec7d4bcd6c0504b43fa7630
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


In [153]:
import pytrec_eval

In [None]:
pytrec_eval.compute_aggregated_measure([1,5,10], k_values)

In [None]:
evaluator = pytrec_eval.RelevanceEvaluator(query, {'success'})
print(json.dumps(evaluator.evaluate(results_eval), indent=1))
eval = evaluator.evaluate(results_eval)

for measure in sorted(list(eval[list(eval.keys())[0]].keys())):
  print(measure, 'average:',pytrec_eval.compute_aggregated_measure(
                  measure, [query_measures[measure] for query_measures in eval.values()])
              )

In [None]:
evaluator = pytrec_eval.RelevanceEvaluator(query, {'success'})

In [151]:
19162/36133 * 100

53.031854537403476

In [254]:
len(final[final['num_of_words']>100].iloc[0]['misspellings'])

105