In [1]:
# Put the shared course directory first on the import path so that `import kent`
# below resolves to the copy stored there.
import sys
sys.path.insert(0, '/data/critt/shared/Spring19/')

import kent
import importlib
# Re-execute the module so edits made to kent.py after the kernel first imported it take effect.
importlib.reload(kent)

<module 'kent' from '/data/critt/shared/Spring19/kent.py'>

### Load the BML12 data frame

In [2]:
# Load the BML12 tables into `df`.
# NOTE(review): the meaning of the "*st" pattern and the exact return type are
# defined by kent.readTPDDBtables, which is not visible here — confirm there.
df = kent.readTPDDBtables(
    ['BML12/Tables/'],
    "*st",
    path='/data/critt/tprdb/TPRDB/',
)

### Load the Spanish and English Lev_1 (Levenshtein distance 1) dictionaries

In [4]:
import json
# Read the Spanish Lev_1 dictionary from JSON.
# The context manager closes the file handle as soon as the text is read (the
# bare open(...).read() left it to the garbage collector), and the encoding is
# pinned to UTF-8 so accented tokens decode the same way whatever the locale.
with open("spanish_lev1_dic.json", "r", encoding="utf-8") as lev1_file:
    f = lev1_file.read()  # raw JSON text; the name `f` is kept in case other cells read it
spanish_lev1_dic = json.loads(f)

In [6]:
# Read the English Lev_1 dictionary from JSON.
# The context manager closes the file handle as soon as the text is read (the
# bare open(...).read() left it to the garbage collector), and the encoding is
# pinned to UTF-8 so the text decodes the same way whatever the locale.
with open("/users/kent/dsahoo/stoken_lev1_wordlist_new.json", "r", encoding="utf-8") as lev1_file:
    f = lev1_file.read()  # raw JSON text; the name `f` is kept in case other cells read it
english_lev1_dic = json.loads(f)

In [7]:
# Sanity check: report how many keys each Lev_1 dictionary holds.
print(f"Spanish: {len(spanish_lev1_dic)}, English: {len(english_lev1_dic)}")

Spanish: 433, English: 413


### Load the Spanish and English Frequency dictionaries

In [15]:
# Read the English frequency dictionary from JSON.
# The context manager closes the file handle deterministically (the bare
# open(...).read() never closed it); UTF-8 is requested explicitly so decoding
# does not depend on the locale.
with open('english_freq.json', 'r', encoding='utf-8') as freq_file:
    english_dic = freq_file.read()  # raw JSON text; name kept in case other cells read it
english_freq_dic = json.loads(english_dic)

In [18]:
# Read the Spanish frequency dictionary from JSON.
# The context manager closes the file handle deterministically (the bare
# open(...).read() never closed it); UTF-8 is requested explicitly so accented
# tokens decode the same way whatever the locale.
with open('spanish_freq.json', 'r', encoding='utf-8') as freq_file:
    spanish_dic = freq_file.read()  # raw JSON text; name kept in case other cells read it
spanish_freq_dic = json.loads(spanish_dic)

In [21]:
# Sanity check: report how many entries each frequency dictionary holds.
print(f"English Freq dic: {len(english_freq_dic)}, Spanish Freq dic: {len(spanish_freq_dic)}")

English Freq dic: 604205, Spanish Freq dic: 3225296


In [85]:
import nltk
import numpy as np

def semantic_sim_set(comp_tuple_list, score_limit = 0.7, include_stopwords = False):
    """
    Keep the (word, score) pairs whose word is purely alphabetic and whose
    score reaches score_limit.
    Input
        comp_tuple_list - iterable of (word, score) pairs
        score_limit - minimum score a pair needs to be kept (default = 0.7)
        include_stopwords - accepted but not used by this function
    Returns
        list of the surviving (word, score) pairs, in their original order
    """
    kept = []
    for candidate, similarity in comp_tuple_list:
        if not candidate.isalpha():
            continue
        if similarity >= score_limit:
            kept.append((candidate, similarity))
    return kept


def semantic_similar_words_dic(stoken_list, model, topn=100, score_limit=0.7):
    """
    Build a dictionary mapping every distinct token in stoken_list to its list
    of semantic competitors.
    Input
        stoken_list - iterable of source tokens; duplicates are looked up once
        model - model handed on to get_semantic_similar_word_list
        topn - maximum number of neighbours requested per token (default = 100)
        score_limit - threshold similarity score (default = 0.7)
    Returns
        dict of token -> list of (word, score) pairs
    """
    # topn and score_limit are forwarded explicitly: they used to be accepted
    # here but never passed on, so the callee's own defaults were always used.
    return {
        stoken: get_semantic_similar_word_list(
            stoken, model, topn=topn, score_limit=score_limit
        )
        for stoken in set(stoken_list)
    }

def get_semantic_similar_word_list(stoken, model=glove_model, topn=100, score_limit=0.7):
    """
    Look up the nearest neighbours of a source token and keep the alphabetic
    ones whose similarity reaches score_limit.
    Input
        stoken - source token from tpr db; it is lower-cased before the lookup
        model - object exposing most_similar(word, negative=None, topn=...).
                The default, glove_model, is bound when this def statement
                runs, so glove_model must already exist at that point.
        topn - maximum number of words to request from the model (default = 100)
        score_limit - threshold similarity score (default = 0.7)
    Returns
        list of (word, score) pairs as filtered by semantic_sim_set; the list
        is empty when the model lookup raises KeyError
    """
    query = stoken.lower()
    try:
        neighbours = model.most_similar(query, negative=None, topn=topn)
    except KeyError:
        # The lookup throws KeyError if the word is not in the vocabulary.
        neighbours = []
    return semantic_sim_set(neighbours, score_limit=score_limit)

## To be used by function summation_freq_simscore()
def eval_freq_score(word, score, log_scale, min_freq=20, freq_dic=bnc_freq_dic):
    """
    Weight a similarity score by the frequency of its word.
    Input
        word - word whose frequency is looked up in freq_dic
        score - similarity score to be weighted
        log_scale - if truthy the weight is log2(frequency), otherwise the raw frequency
        min_freq - frequency used when the word is missing from freq_dic or
                   its stored value is falsy (default = 20)
        freq_dic - dictionary containing words with their frequency.
                   The default, bnc_freq_dic, is bound when this def statement runs.
    Returns
        weight * score
    """
    # Look the word up in the dictionary the caller passed in; the previous
    # version always read the global bnc_freq_dic and ignored this parameter.
    freq = freq_dic.get(word) or min_freq
    weight = np.log2(freq) if log_scale else freq
    return weight * score
    
def calc_entropy(freq_list):
    """
    Calculate the entropy (natural log) of the distribution obtained by
    normalising a list of frequencies.
    Input
        freq_list - list of non-negative frequencies
    Returns
        sum of -p*ln(p) over the normalised frequencies; 0.0 when the list is
        empty or its total is not positive
    """
    if not freq_list:
        return 0.0
    total_freq = sum(freq_list)
    if total_freq <= 0:
        # Nothing to normalise by; avoids a division by zero.
        return 0.0
    prob_list = [f/total_freq for f in freq_list]
    # Zero-probability entries are skipped: they contribute nothing to the
    # entropy, whereas evaluating 0*log(0) numerically yields NaN.
    return sum(-p*np.log(p) for p in prob_list if p > 0)

## The functions below take a single token, so they can be applied to a DataFrame column with .apply()
    
def summation_freq_simscore(stoken, freq_dic=bnc_freq_dic, model=glove_model, min_freq=20,log_scale=True):
    """
    Sum the frequency-weighted similarity scores of a token's semantic competitors.
    Input
        stoken - source token
        freq_dic - dictionary containing words with their frequency
        model - model handed on to get_semantic_similar_word_list
        min_freq - frequency assumed for words without a usable entry in freq_dic (default = 20)
        log_scale - weight by log2(frequency) instead of the raw frequency (default = True)
        The freq_dic and model defaults are bound when this def statement runs.
    Returns
        sum of eval_freq_score over the competitors; 0.0 when there are none
    """
    sim_list = get_semantic_similar_word_list(stoken, model)
    if not sim_list:
        return 0.0
    # freq_dic and min_freq are handed through to eval_freq_score; previously
    # they were accepted here but silently dropped, so callers could not
    # change either of them.
    return sum(
        eval_freq_score(word, sim_score, log_scale, min_freq=min_freq, freq_dic=freq_dic)
        for word, sim_score in sim_list
    )

def sum_sim_scores(stoken, model=glove_model):
    """
    Add up the similarity scores of a token's semantic competitors.
    Input
        stoken - source token
        model - model handed on to get_semantic_similar_word_list
                (the default, glove_model, is bound when this def statement runs)
    Returns
        total of the scores; 0.0 when the token has no competitors
    """
    neighbours = get_semantic_similar_word_list(stoken, model)
    if not neighbours:
        return 0.0
    return sum([similarity for _, similarity in neighbours])


def entropy_semantic_sim(stoken, model=glove_model, freq_dic=bnc_freq_dic, min_freq=20):
    """
    Calculate the entropy of the frequencies of a token's semantic competitors.
    Only the competitors' frequencies enter the calculation; their similarity
    scores are not used here.
    Input
        stoken - source token
        model - model handed on to get_semantic_similar_word_list
        freq_dic - dictionary containing words with their frequency
        min_freq - frequency used for words without a usable entry in freq_dic (default = 20)
        The model and freq_dic defaults are bound when this def statement runs.
    Returns
        result of calc_entropy over the frequencies; 0.0 when there are no competitors
    """
    neighbours = get_semantic_similar_word_list(stoken, model)
    if not neighbours:
        return 0.0
    frequencies = [freq_dic.get(word) or min_freq for word, _ in neighbours]
    return calc_entropy(frequencies)
    
def entropy_ortho_sim(stoken, lev_dic=word_to_lev_dic1, freq_dic=bnc_freq_dic, min_freq=20, verbose=False):
    """
    Calculate the entropy of the frequencies of the words that lev_dic lists
    as orthographically similar to stoken.
    Input
        stoken - source token from TPR db; looked up in lev_dic exactly as given
        lev_dic - dictionary with levenshtein's distance=1
        freq_dic - dictionary containing words with their frequency
        min_freq - frequency used for words without a usable entry in freq_dic (default=20)
        verbose - when truthy, print the similar words and their frequencies
        The lev_dic and freq_dic defaults are bound when this def statement runs.
    Returns
        result of calc_entropy over the frequencies; 0.0 when lev_dic has no
        similar words for stoken
    """
    sim_words = lev_dic.get(stoken)
    if not sim_words:
        return 0.0

    if verbose:
        print(f"Similar words:\n\n{sim_words}")

    freq_list = [freq_dic.get(word) or min_freq for word in sim_words]
    if verbose:
        print(f"Similar words freq:\n\n{freq_list}")

    return calc_entropy(freq_list)
    

NameError: name 'model' is not defined

### Universal implementation

In [30]:
# Distinct TGroup values made up of alphabetic characters only.
spanish_tgroup_single = set(df.loc[df['TGroup'].str.isalpha(), 'TGroup'].tolist())

In [43]:
# New column holding TGroup where it is purely alphabetic, and '' otherwise.
df['Single_TToken'] = [tgroup if tgroup.isalpha() else '' for tgroup in df['TGroup']]

In [50]:
def tokens_per_million(token, dic=spanish_freq_dic):
    """
    Scale a token's value in dic to a per-million figure, rounded to 2 decimals.
    Input
        token - token to look up
        dic - dictionary of token -> frequency (the default, spanish_freq_dic,
              is bound when this def statement runs)
    Returns
        value / len(dic) * 1000000 rounded to 2 decimals; 0.0 when the token is
        missing from dic or its value is falsy
    NOTE(review): the denominator is len(dic), i.e. the number of entries in
    the dictionary, not the sum of the stored frequencies — confirm this is
    the normalisation intended for a "per million" measure.
    """
    token_freq = dic.get(token)
    if not token_freq:
        return 0.0
    return round(token_freq/len(dic) * 1000000, 2)

In [52]:
# Per-million figure of each single-word target token, using the function's default dictionary.
df['TTokens_per_mil'] = df['Single_TToken'].map(tokens_per_million)

In [96]:
# Per-million figure of each source token, looked up in the English frequency dictionary.
df['SToken_per_mil'] = df['SToken'].apply(tokens_per_million, args=(english_freq_dic,))

In [55]:
# Correlation between target-token frequency and ProbT, over rows with a non-zero frequency.
df.loc[df['TTokens_per_mil'] != 0, ['TTokens_per_mil','ProbT']].corr()

Unnamed: 0,TTokens_per_mil,ProbT
TTokens_per_mil,1.0,0.172786
ProbT,0.172786,1.0


In [97]:
# Spot check: show the frequency columns for every row whose source token is 'academic'.
df.loc[
    df['SToken'] == 'academic',
    ['Text','Id','SToken','TGroup','Single_TToken','ProbT','AltT','CountT','TTokens_per_mil','SToken_per_mil'],
]

Unnamed: 0,Text,Id,SToken,TGroup,Single_TToken,ProbT,AltT,CountT,TTokens_per_mil,SToken_per_mil
5,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
1165,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
2268,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
2546,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
3706,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
4809,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
5087,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
6642,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
7489,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12
7628,5,6,academic,académica,académica,0.7586,3,22,5614.06,7926.12


In [68]:
def orthoSimCount(token,dic):
    """
    Count the entries that dic stores for token.
    Input
        token - key to look up
        dic - dictionary mapping a token to a sized collection
    Returns
        len() of the stored collection; 0 when the token is missing or its
        stored value is empty/falsy
    """
    neighbours = dic.get(token)
    return len(neighbours) if neighbours else 0


In [69]:
# Number of Lev_1 entries each source token has in the Spanish and in the English dictionary.
# NOTE(review): both columns look up SToken (the source token), including in the
# Spanish dictionary — confirm that is intended.
df['spanish_orthoSim'] = df['SToken'].apply(orthoSimCount, args=(spanish_lev1_dic,))
df['english_orthoSim'] = df['SToken'].apply(orthoSimCount, args=(english_lev1_dic,))

In [71]:
# Combined count of Spanish and English Lev_1 entries per row.
df['orthoSim'] = df['spanish_orthoSim'].add(df['english_orthoSim'])

In [78]:
# Character length of each source token, computed with the vectorised string
# accessor instead of a per-element Python call.
df['STlen'] = df['SToken'].str.len()

In [82]:
# Correlation between Dur and the Lev_1 counts, restricted to rows with at least
# one Lev_1 entry, a positive Dur and a source token longer than 4 characters.
df.loc[
    (df['orthoSim'] > 0) & (df['Dur'] > 0) & (df['STlen'] > 4),
    ['Dur','spanish_orthoSim','english_orthoSim','orthoSim'],
].corr()

Unnamed: 0,Dur,spanish_orthoSim,english_orthoSim,orthoSim
Dur,1.0,-0.00038,0.019208,0.007892
spanish_orthoSim,-0.00038,1.0,0.719418,0.955709
english_orthoSim,0.019208,0.719418,1.0,0.891978
orthoSim,0.007892,0.955709,0.891978,1.0
