In [None]:
'''
Florida International Univeristy - Data Science MS
CAP 5640 - NLP - Spring 2019
Andrea Garcia and Constanza Schubert

File to perform naive spelling correction on certain terms from the tokenized tweets.

'''

In [213]:
#imports
import pandas as pd
from ast import literal_eval

from spellchecker import SpellChecker
from symspellpy.symspellpy import SymSpell, Verbosity

import requests
import json
import re

In [134]:
#Import the data
apple_df=pd.read_csv('.\\Data\\apple_tokenized.csv', usecols=['text','Hashtags', 'pos_tags','tweet_no_stop'])

# Convert entire column to a list
#apple_df.loc[:,'tokenized_tweets'] = apple_df.loc[:,'tokenized_tweets'].apply(lambda x: literal_eval(x))
apple_df.loc[:,'pos_tags'] = apple_df.loc[:,'pos_tags'].apply(lambda x: literal_eval(x))
apple_df.loc[:,'tweet_no_stop'] = apple_df.loc[:,'tweet_no_stop'].apply(lambda x: literal_eval(x))
apple_df.loc[:,'Hashtags'] = apple_df.loc[:,'Hashtags'].apply(lambda x: literal_eval(x))

In [135]:
apple_df.head()

Unnamed: 0,text,Hashtags,pos_tags,tweet_no_stop
0,Ok guys its time for Dimonds available now in ...,"[cydia, theme, ios, jailbreak, anemone, iPhone...","[(Ok, NNP), (guys, VBZ), (its, PRP$), (time, N...","[ok, guys, time, dimonds, available, repo, cyd..."
1,It makes me chuckle when articles claim that t...,[],"[(It, PRP), (makes, VBZ), (me, PRP), (chuckle,...","[makes, chuckle, articles, claim, versa, compe..."
2,This was pretty cool! Thank you Apple for havi...,[todayatapple],"[(This, DT), (was, VBD), (pretty, RB), (cool, ...","[pretty, cool, thank, thank, everyone, came, m..."
3,Bulletinatomic Potus Davos Wef Energy Yearsofl...,"[HSS, LRAD]","[(HSS, NNP), (LRAD, NNP), (PGA, NNP), (Dairy, ...","[hss, lrad, pga, dairy, us, eu, money, game, b..."
4,I m pretty sure I just discovered that Familyg...,"[FamilyGuy, Apple, ApplePencil]","[(I, PRP), (m, VBP), (pretty, JJ), (sure, JJ),...","[pretty, sure, discovered, predicted, apple, p..."


## Reduce number of words to check spelling of

In [136]:
#Excluding: Twitter handles, hashtags
#capitalized words, NNP

#remove pronouns, conjunctions, prepositions, determiners, etc
#only keep nouns, verbs, adjectives, adverbs


In [137]:
#Finalize list of words to check for spelling

def word_list(df):
    
    #Additional list of terms I do not want to be flagged

    #All terms excluding hashtags
    all_terms = df.apply(lambda row: list(set(row['tweet_no_stop'])-set(row['Hashtags'])),axis=1)
    
    #Keep only nouns, verbs, adjectives, and adverbs
    #JJ, JJR, JJS, NN, NNS, RB, RBR, RBS, VB, VBD, VBG, VBN, VBP, VBZ
    keep={'JJ','JJR','JJS','NN','NNS', 'RB', 'RBR', 'RBS', 'VB','VBD','VBG','VBN','VBP','VBZ'}
    check_words=df['pos_tags'].apply(lambda x: [item[0] for item in x if item[1] in keep ])
    #Intersection between all_terms and check_words
    d = pd.DataFrame({'all terms':all_terms,'check words':check_words})
    final_words = d.apply(lambda row: list(set(row['all terms']).intersection(set(row['check words']))),axis=1 )
    df['check words']=final_words
    return df

In [138]:
apple_df = word_list(apple_df)

In [139]:
apple_df.head()

Unnamed: 0,text,Hashtags,pos_tags,tweet_no_stop,check words
0,Ok guys its time for Dimonds available now in ...,"[cydia, theme, ios, jailbreak, anemone, iPhone...","[(Ok, NNP), (guys, VBZ), (its, PRP$), (time, N...","[ok, guys, time, dimonds, available, repo, cyd...","[guys, time, repo, available]"
1,It makes me chuckle when articles claim that t...,[],"[(It, PRP), (makes, VBZ), (me, PRP), (chuckle,...","[makes, chuckle, articles, claim, versa, compe...","[compete, years, chuckle, articles, makes, ver..."
2,This was pretty cool! Thank you Apple for havi...,[todayatapple],"[(This, DT), (was, VBD), (pretty, RB), (cool, ...","[pretty, cool, thank, thank, everyone, came, m...","[came, production, everyone, songwriting, much..."
3,Bulletinatomic Potus Davos Wef Energy Yearsofl...,"[HSS, LRAD]","[(HSS, NNP), (LRAD, NNP), (PGA, NNP), (Dairy, ...","[hss, lrad, pga, dairy, us, eu, money, game, b...","[bankrupt, game, filtering, take]"
4,I m pretty sure I just discovered that Familyg...,"[FamilyGuy, Apple, ApplePencil]","[(I, PRP), (m, VBP), (pretty, JJ), (sure, JJ),...","[pretty, sure, discovered, predicted, apple, p...","[discovered, sure, predicted, pretty]"


In [140]:
#sample data for testing
df=apple_df[0:10]

# Spell Checker approaches
***

## Norvig approach - pyspellchecker

In [218]:
def norvig_spell(text):
    #spellcheck tokenized text including stop words
    spell = SpellChecker()
    misspelled = spell.unknown(text)
    candidate_corrections = []
    for word in misspelled:
        candidate_corrections.append({word:list(spell.candidates(word))})
        #candidate_corrections.append(spell.candidates(word))
    return candidate_corrections

Test on small sample

In [220]:
%%time
df['check words'].apply(norvig_spell)

Wall time: 2.98 s


0    []
1    []
2    []
3    []
4    []
5    []
6    []
7    []
8    []
9    []
Name: check words, dtype: object

In [None]:
# %time
# #apply norvig spellchecker
# #takes about an hour
# apple_df['check words'].apply(norvig_spell)

Wall time: 0 ns


## Symspell

In [214]:
def sym_spellcheck(input_term):
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = '.\\Data\\frequency_dictionary_en_82_765.txt'
                                   
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings
    #input_term = "memebers"  # misspelling of "members"
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)
    # display suggestion term, term frequency, and edit distance
    result = []
    for suggestion in suggestions:
        result.append(suggestion.term)
#         result.append(suggestion.distance)
#         result.append(suggestion.count)
        
            
    
    return result


In [221]:
%%time
sym_spellcheck("members")

Wall time: 7.76 s


['members']

Test on small sample

In [216]:
%%time
df['check words'].apply(lambda x: [sym_spellcheck(y) for y in x])

Wall time: 4min 47s


0    [[guys], [time], [rep, reno, reps, redo, repro...
1    [[compete], [years], [chuckle], [articles], [m...
2    [[came], [production], [everyone], [songwritin...
3            [[bankrupt], [game], [filtering], [take]]
4        [[discovered], [sure], [predicted], [pretty]]
5                                           [[louder]]
6                                                   []
7    [[husband], [access], [photos], [rules], [man]...
8                                           [[louder]]
9                                                   []
Name: check words, dtype: object

## Bing Spell Check API
***

In [209]:
def bing_spell(wl):
    api_key = "052dae11302b42e58ad8ddb3b72af085"
    endpoint = "https://api.cognitive.microsoft.com/bing/v7.0/SpellCheck"
    
    params = {
    'mkt':'en-us',
    'mode':'proof'
    }
    headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'Ocp-Apim-Subscription-Key': api_key,
    }
    
    #If text_str is empty - SKIP ALL THIS
    r = {}
    if not []:
        return r

    else:
        text_str = ' '.join(wl) #text to be spell checked
        data = {'text': text_str}

        response = requests.post(endpoint, headers=headers, params=params, data=data)
        json_response = response.json()
        
        #output
       
        for i in json_response['flaggedTokens']:
        #get most suggestion with highest score
            r[i['token']] = i['suggestions'][0]['suggestion']
    
    return r

In [210]:
# Implement suggested word changes

def correct_text(suggestions,text):
    #suggestions is a dictionary in the form of {"condition1": "", "condition2": "text"}
    corrected_text = text
    #replacement into text string
    if not {}:
        return corrected_text
    
    else:
        pattern = re.compile("|".join(suggestions.keys()))
        corrected_text = pattern.sub(lambda m: suggestions[re.escape(m.group(0))], text)
        
    return corrected_text

Test on small sample

In [211]:
#Implement BING Spell Checker API
df['Bing suggestions'] = df['check words'].apply(lambda x: bing_spell(x))
#Get corrected text
df['Spell-checked text'] = df.apply(lambda row: correct_text(row['Bing suggestions'],row['text']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [212]:
df

Unnamed: 0,text,Hashtags,pos_tags,tweet_no_stop,check words,Bing suggestions,Spell-checked text
0,Ok guys its time for Dimonds available now in ...,"[cydia, theme, ios, jailbreak, anemone, iPhone...","[(Ok, NNP), (guys, VBZ), (its, PRP$), (time, N...","[ok, guys, time, dimonds, available, repo, cyd...","[guys, time, repo, available]",{},Ok guys its time for Dimonds available now in ...
1,It makes me chuckle when articles claim that t...,[],"[(It, PRP), (makes, VBZ), (me, PRP), (chuckle,...","[makes, chuckle, articles, claim, versa, compe...","[compete, years, chuckle, articles, makes, ver...",{},It makes me chuckle when articles claim that t...
2,This was pretty cool! Thank you Apple for havi...,[todayatapple],"[(This, DT), (was, VBD), (pretty, RB), (cool, ...","[pretty, cool, thank, thank, everyone, came, m...","[came, production, everyone, songwriting, much...",{},This was pretty cool! Thank you Apple for havi...
3,Bulletinatomic Potus Davos Wef Energy Yearsofl...,"[HSS, LRAD]","[(HSS, NNP), (LRAD, NNP), (PGA, NNP), (Dairy, ...","[hss, lrad, pga, dairy, us, eu, money, game, b...","[bankrupt, game, filtering, take]",{},Bulletinatomic Potus Davos Wef Energy Yearsofl...
4,I m pretty sure I just discovered that Familyg...,"[FamilyGuy, Apple, ApplePencil]","[(I, PRP), (m, VBP), (pretty, JJ), (sure, JJ),...","[pretty, sure, discovered, predicted, apple, p...","[discovered, sure, predicted, pretty]",{},I m pretty sure I just discovered that Familyg...
5,louder: OH MY GOD THE GOVERNMENT's SETTLED ST...,[],"[(louder, NN), (OH, NNP), (MY, NNP), (GOD, NNP...","[louder, god, government, settled, status, dig...",[louder],{},louder: OH MY GOD THE GOVERNMENT's SETTLED ST...
6,Commissioned by Apple A Great Day In Accra Se...,"[ShotOniPhone, Ghana, Music, Africa, Hiplife, ...","[(Commissioned, VBN), (by, IN), (A, DT), (Grea...","[commissioned, great, day, accra, series, shot...",[],{},Commissioned by Apple A Great Day In Accra Se...
7,Court rules man must be given access to husban...,[apple],"[(Court, NNP), (rules, NNS), (man, NN), (must,...","[court, rules, man, must, given, access, husba...","[husband, access, photos, rules, man, given]",{},Court rules man must be given access to husban...
8,louder: OH MY GOD THE GOVERNMENT's SETTLED ST...,[],"[(louder, NN), (OH, NNP), (MY, NNP), (GOD, NNP...","[louder, god, government, settled, status, dig...",[louder],{},louder: OH MY GOD THE GOVERNMENT's SETTLED ST...
9,Commissioned by Apple A Great Day In Accra Se...,"[ShotOniPhone, Ghana, Music, Africa, Hiplife, ...","[(Commissioned, VBN), (by, IN), (A, DT), (Grea...","[commissioned, great, day, accra, series, shot...",[],{},Commissioned by Apple A Great Day In Accra Se...


In [None]:
#TO-DO: use text - hashtags - twitter handles => feed to Bing
#use tweet no handles text column instead for Bing API