In [1]:
import pandas as pd
import os
import gzip
from tqdm.autonotebook import tqdm
import hunspell
import re



In [2]:
def en_contraction_removal(text: str) -> str:
    apostrophe_handled = re.sub("’", "'", text)
    # from https://gist.githubusercontent.com/tthustla/74e99a00541264e93c3bee8b2b49e6d8/raw/599100471e8127d6efad446717dc951a10b69777/yatwapart1_01.py
    contraction_mapping = {
                    "i.e.": 'for example',
                    "e.g.": 'for example',
                    "youre": "you are",
                    "youll": "you will",
                    "theyre": "they are", "theyll": "they will",
                    "weve": "we have",
                    "shouldnt": "should not",
                    "dont": "do not",
                    "doesnt": "does not", "doesn": "does not",
                    "didnt": "did not",
                    "wasn": "was not",
                    "arent": "are not", "aren": "are not",
                    "aint": "is not", "isnt": "is not", "isn": "is not",
                    "wouldnt": "would not", "wouldn": "would not",
                    "ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                       "here's": "here is",
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }
    expanded = ' '.join([contraction_mapping[t.lower()] if t.lower() in contraction_mapping else t for t in apostrophe_handled.split(" ")])
    return expanded

In [3]:
hobj = hunspell.HunSpell('/Library/Spelling/en_US.dic', '/Library/Spelling/en_US.aff')


known_words = ['GMO', 'GMOs', 'Wal-Mart', 'Publix', 'Glyphosate', 'co2', 'Waitrose', '<URL>', 'certifier', 'TLDR', 'Coca~Cola', 'Quora', 'sci-fi']

for w in known_words:
    hobj.add(w)

In [4]:
url_regex = r'(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&\(\)\*\+,;=.]+'

def replace_urls_regex(sentence: str, url_token: str = '<URL>') -> str:
    return re.sub(url_regex, url_token, sentence)

def replace_urls(words: List[str], url_token: str = '<URL>') -> List[str]:
    return [url_token if (w.lower().startswith('www') or w.lower().startswith('http')) else w for w in words]

def spellcheck_sentence(row) -> str:
    sent = row['Sentence']
    #print(sent)
    to_remove = [',', '(', ')', ':', '?', '&', '/']
    for tr in to_remove:
        sent = sent.replace(tr, ' ')
        
    sent = sent.replace('€™', "'")
    sent = sent.replace('�', "'")
    sent = en_contraction_removal(sent)
    sent = sent.replace("'", ' ')

    
    
    tokens = sent.split(' ')
    result = []
    for t in tokens:
        if t == ' ':
            continue
        if not hobj.spell(t):
            suggestions = hobj.suggest(t)
            if not suggestions:
                result.append(t)
            else:
                if suggestions[0] == 'e':
                    result.append(t)
                    continue
                result.append(suggestions[0])
                print(f'{t} -> {suggestions[0]}')
        else:
            result.append(t)
    return ' '.join(result)
        
#spellcheck_sentence('This is a tset with a wong wod. Adn now anotheer one why does this notjn workd')

In [5]:
splits = ['train', 'validation', 'test']
path = os.path.join(os.getcwd(), 'data', 'data', 'organic2019')

for s in splits:
    print('Split: ' + str(s))
    fn = os.path.join(path, s + '.csv')
    df = pd.read_csv(fn, sep='|') #
    df['Sentence'] = df.apply(spellcheck_sentence, axis=1)
    
    fn = os.path.join(path, s + '_sp.csv')
    df.to_csv(fn, sep='|')

Split: train
externalities -> externalizes
Walkin -> Walking
DeKalb -> Kalb
labour -> labor
NPOP -> POP
NOP -> BOP
NPOP -> POP
bestest -> best est
bestest -> best est
amoungst -> amount
enviroments -> environments
enviroments -> environments
knowlege -> knowledge
metioned -> mentioned
deminish -> diminish
monoculture -> mono culture
fertilisers -> fertilizers
‘dead -> dead
labour -> labor
vegitarians -> vegetarians
superbugs -> super bugs
‘catching -> catching
superbug -> super bug
fertilisers -> fertilizers
benifits -> benefits
fradulent -> fraudulent
Muricans -> Americans
buds! -> buds
$3 -> 3
$1 -> 1
embryonated -> embryonic
review406 -> reviewer
macerators -> accelerators
poults -> pouts
Organisation -> Organization
Union.4 -> Union
food[1]. -> food
aflatoxin. -> antitoxin
‚Äúscientists‚Äù -> conscientiousness
supporter‚Äù -> supporter
aren‚Äôt -> aren't
can‚Äôt -> cantata
aren‚Äôt -> aren't
that‚Äôs -> thatch
seedstock -> seed stock
don‚Äôt -> donation
there‚Äôs -> Therese
america

U.S. -> US
U.S. -> US
labelled -> labeled
UHT -> HUT
Darlington -> Darling ton
+6 -> 6
teperature -> temperature
recognised -> recognized
recognises -> recognizes
UAE. -> USE
yeild -> yield
eco -> Eco
craftsmenship. -> craftsmanship
Thoughtscapism -> Thoughtfulness
minimise -> minimize
IPM -> IMP
minimise -> minimize
IPM -> IMP
U.S. -> US
IPM -> IMP
1% -> 1
“Everything -> everything
“natural” -> natural
“natural” -> natural
Bt -> Br
Bt-biotech -> Br-biotech
labour -> labor
eco-friendly. -> Eco-friendly
bioengineering -> bio engineering
certifiers -> rectifiers
Swinney -> Spinney
EatingExpectantly -> Eating Expectantly
nitrous -> nitro us
agriculture—typically -> electromagnetically
agriculture—impose -> agriculturalist
“ammonia -> ammonia
nitrous -> nitro us
systems” -> systems
“land -> land
acidification -> solidification
unit.” -> unity
U.S. -> US
*Not -> not
Eggs*. -> Eggs
*Wow -> wow
Colors! -> Colors
organicness. -> organics
NPOP -> POP
BioSuisse -> Biosensor
certifcatin -> certif

realisation -> realization
NPOF -> NP OF
NCOF -> NCO
RCOFs -> Roofs
Centre -> Center
agri -> agree
fertilisers -> fertilizers
India‚Äôs -> Indianans
tsunamic -> tsunami
Kerala -> Lateral
Tamilnadu. -> Tamil
Kerala -> Lateral
Tamilnadu -> Tamil
andorganic -> and organic
programme -> programmer
Raj -> Ra
Kiran -> Karin
gmo -> GMO
suffocants -> suffocation
U.S. -> US
fertilisers -> fertilizers
$0 -> 0
2+ -> 2
$5 -> 5
silly! -> silly
ecofriendly -> unfriendly
safeway. -> Safeway
U.S. -> US
U.S. -> US
U.S. -> US
runnoff -> runoff
fertilise -> fertilize
fertiliser. -> fertilizer
programme -> programmer
#5 -> 5
#6 -> 6
“non-organic” -> non-organic”
“inorganic”. -> inorganic
“Organic -> organic
harmony.” -> harmony
NOSB -> NOBS
harmony.” -> harmony
NOSB -> NOBS
harmony.” -> harmony
NOSB -> NOBS
Strough -> Trough
Strough -> Trough
vegitables -> vegetables
Strough -> Trough
farmstand -> farm stand
california -> California
florida -> Florida
IOFGA -> FIGARO
eco-system -> Eco-system
food! -> food


Subramanyam -> Subprogram
Bhadriraju -> Hadrian
NASS -> SANS
tond -> tons
tond -> tons
coli -> coil
7billion -> billion
cumulate -> accumulate
grey -> Grey
‘naturally -> naturally
‘synthetically -> synthetically
‘good -> good
1ltr -> ultra
30seconds -> seconds
Aris -> Aries
‘dilute -> dilute
EWG -> EWE
Rosaceae -> Aerospace
neurotoxic -> neurotic
ziram -> Hiram
ziram -> Hiram
neurotoxic -> neurotic
organophosphate -> phosphorylation
Azinphos -> Haziness
flavonoids -> flavoring
lycopene -> glycogen
coli -> coil
coli -> coil
chill! -> chill
Coli -> Colo
is! -> is
bigbasket -> big basket
onekirana. -> crankiness
omega3 -> omega
omega6 -> omega
omega3 -> omega
omega6 -> omega
agrochemicals -> petrochemicals
agrochemicals -> petrochemicals
tldr -> TLDR
tldr -> TLDR
convential -> conventional
eco-agriculture -> Eco-agriculture
dataset -> data set
Badgley -> Bradley
paleoecologist -> paleontologist
kilocalories -> kilo calories
extrenely -> extremely
guage -> gauge
Berkley -> Berkeley
U.S. ->

certififed -> certified
labelling -> labeling
Ambronite -> Maronite
Ambronite -> Maronite
formulationBoth -> formulation Both
Ambronite -> Maronite
Ambronite -> Maronite
Ambronite -> Maronite
sea-buckthorn -> sea-buck thorn
lucuma -> lucubrate
Ambronite -> Maronite
Ambronite. -> Maronite
flaxseed -> flax seed
Ambronite -> Maronite
realised -> realized
Evira -> Evita
Ambronite -> Maronite
V4 -> V
kingTaste -> king Taste
Ambronite -> Maronite
containerAmbronite -> intercontinental
kcal -> cal
Ambronite -> Maronite
forWe -> for We
M.D. -> MED
Ambronite -> Maronite
Soylent -> Soy lent
Huel -> Hull
gullibly -> gullible
BeGood -> Be Good
saftey. -> safety
“organic” -> organic
bioactive -> bio active
CLA -> CAL
uncaged -> uncased
Salatin -> Saladin
Polyface -> Poly face
Farms! -> Farms
ag -> Ag
Geer -> Gere
Waltons -> Walton
oppertunity -> opportunity
oppertunity -> opportunity
oppertunity -> opportunity
expierimenting -> experimenting
expierimental -> experimental
tapwater -> tap water
succe

telugu -> Telugu
Thegalu. -> Megalith
Andhra -> Rand
Pradesh. -> Preshrank
Borassus -> Parnassus
flabellifer. -> flabbergast
Odiyal -> Zodiacal
chewable -> chew able
Thaati -> That
Munjelu -> Muskellunge
Thaati -> That
Munjalu -> Municipal
thati -> that
munjalu -> municipal
Andhra -> Rand
Pradesh -> Preshrank
HarmonyArt -> Harmony Art
GOTS -> TOGS
orthorexic -> orthopedic
“better”. -> better
Canino -> Canine
EWG -> EWE
GOTS -> TOGS
flavour -> flavor
Fysh -> Fish
JAS -> HAS
phd -> pd
Agricultute -> Agriculture
orgnic -> organic
theecostore. -> threescore
U.S. -> US
Tiangyi -> Tianjin
labelled -> labeled
Tiangyi -> Tianjin
labelled -> labeled
NGO -> NO
JUCCCE -> JUICE
9xxx -> xxx
labelled -> labeled
neighbours -> neighbors
95%-organic -> inorganic
delhi. -> Delhi
www.Eatofresh.com -> www. Eatofresh.com
planograms -> sonograms
ribboned -> rib boned
Splenda -> Splendid
NCR -> NOR
Gourmetdelight. -> Gourmet delight
artisanal -> artisan
Gourmetdelight -> Gourmet delight
helps! -> helps
makei

want! -> want
bluebies -> blueberries
ferricberries -> ferric berries
CHNOPS -> CHOPS
carb -> crab
lol. -> lo
helps! -> helps
El -> E
Galpon -> Gallon
Chacarita -> Charity
MRL -> ML
Alimentarius -> Alimentary
lol -> lo
Yee -> See
haven‚Äôt -> haven't
se. -> SE
Composter -> Com poster
pre-packaged -> per-packaged
tha -> they
Kerilan -> Flanker
feilds -> fields
Youtube -> YouTube
Salatin. -> Saladin
too! -> too
organisations -> organizations
doesn -> does
pre-requisite -> per-requisite
Programme -> Programmer
NPOP -> POP
NPOP. -> POP
NPOP. -> POP
Programme -> Programmer
NPOP -> POP
[NAB] -> NAB
NPOP -> POP
[NAB]. -> NAB
NPOP. -> POP
gmo -> GMO
ag -> Ag
NCR -> NOR
celiac -> celeriac
eventualy -> eventual
eventualy -> eventual
ag -> Ag
BT -> TB
BT -> TB
Bt -> Br
Bt -> Br
Bt -> Br
non-Bt -> non-Br
Bt -> Br
appearence -> appearance
bioactive -> bio active
haryana -> harry
haryana -> harry
Iam -> Ian
haryana -> harry
monocultures -> mono cultures
ERS -> RES
Loma -> Lima
colorectal -> correcti

shortFeel -> short Feel
outExclusive -> out Exclusive
Wallmart. -> Walmart
TexasQuality -> Texas Quality
factorSomeone -> factor Someone
factorSomeone -> factor Someone
outFood -> out Food
individualsFinally -> individuals Finally
barIn -> bar In
barIn -> bar In
barIn -> bar In
Albertons -> Alberto's
“health -> health
claims” -> claims
favour -> favor
safe… -> safe
subclinical -> sub clinical
etc...There -> theretofore
{organic -> organic
to} -> to
feaces -> feces
feaces -> feces
fertilisers -> fertilizers
scalable -> salable
se -> SE
labour -> labor
scalability -> availability
“better -> better
environment” -> environment
“it -> it
healthier” -> healthier
“organic” -> organic
“better -> better
environment” -> environment
“healthier” -> healthiness
“healthier” -> healthiness
“non-organic” -> non-organic”
café -> cafe
UAE. -> USE
aflatoxin -> antitoxin
Ochratoxin -> Intoxication
Organicgarden -> Organic garden
sulphur -> sulfur
Sulfite -> Sulfate
Zeroincondotta -> Incontrovertibly
GOPCA

labour -> labor
one! -> one
availbility. -> availability
nurishing -> nourishing
Matcha -> Match
Matcha -> Match
_buy_ -> buy
breasfeeding -> breastfeeding
fertiliser. -> fertilizer
FAO -> OAF
glyphosate -> Glyphosate
glyphosate -> Glyphosate
banglore -> bang lore
coz -> cox
USDA-ERS -> USURERS
non-ag -> non-Ag
haryana -> harry
ag -> Ag
Greenmarket. -> Green market
manufactured.! -> manufacturer
foods!. -> foods
answer! -> answer
.I -> I
fb -> bf
getorganic -> get organic
WhatsApp -> Whats App
colleting -> collecting
Gardeners! -> Gardeners
‚Äúcooler -> cooler
goodies‚Äù -> goodness
mache -> make
mizuna -> mizzen
italian -> Italian
area‚Äôs -> arrears
LGU -> LUG
LGU -> LUG
dropdown -> drop down
website‚Äôs -> websites
‚ÄúFall -> falloff
Gardens‚Äù -> Gardener
You‚Äôll -> You'll
Services! -> Services
Topping! -> Topping
Fall! -> Fall
grey -> Grey
Highend -> High end
sadashiv -> dashiki
nagar -> agar
Indiranagar -> Devanagari
RMV -> RM
Whitefiled -> Whitefield
OLEAtopia -> Proletariat
hu

In [None]:
df[df['Sentence'].isnull()]