In [None]:
%%time
import json
import pandas as pd
import string
from datamuse import datamuse
from pycorenlp import StanfordCoreNLP 
import json
import nltk
from collections import defaultdict
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import requests


In [None]:
"""
Extracts the features required for the CWI baseline system as described in CWI3G3G2 in (Yimam 2017), with only the most basic frequency features.
1) Number of vowels
2) Number of syllables
3) Number of characters

4) Frequency in Simple Wiki
5) Frequency of word in HIT paragraph
6) Frequency of word in Ngram corpus




"""

In [None]:
%%time

# 1- New preprocessing to handle hyphenated and MWEs
# Change location to file path of file for feature extraction

location = "cwishareddataset/traindevset/english/Wikipedia_Train.tsv"
import pandas as pd
import string


# Column names are supplied explicitly via `names=`; the file is read as tab-separated text.
data_frame = pd.read_table(location, names=('ID', 'sentence', 'start_index', 'end_index', 'word', 'total_native',
                                            'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary',
                                            'complex_probabilistic'), encoding='utf-8-sig')
# Every column is treated as text from here on (missing values become the string 'nan').
data_frame = data_frame.astype(str)

# Normalise the sentences: spell out "%" and replace the curly apostrophe with a plain one.
data_frame['sentence'] = (
    data_frame['sentence']
    .str.replace("%", "percent", regex=False)
    .str.replace("’", "'", regex=False)
)

# Characters stripped from parser tokens in later cells: all ASCII punctuation
# except the hyphen, "(", "," and the apostrophe, plus the two curly double quotes.
remove = "".join(char for char in string.punctuation if char not in "-(,'") + '“' + '”'
pattern = r"[{}]".format(remove)  # character-class pattern built from the same characters

# Split each target into tokens, treating a hyphen as a separate token
# (previously this statement appeared twice in a row).
data_frame['split'] = data_frame['word'].apply(lambda x: x.replace('-', ' - ').split())

# Show a small preview under the header instead of printing the header alone.
print("Cleaned DataFrame:")
print(data_frame.head())



In [None]:
%%time
#2 Get Datamuse data
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache

# Function to get total syllables using Datamuse API
@lru_cache(maxsize=None)
def get_total_syllables(word, timeout=10):
    """Return the syllable count the Datamuse API reports for ``word``.

    Results are memoised with ``lru_cache``, so each distinct word triggers
    at most one HTTP request per kernel session.

    Args:
        word: Spelling to look up (sent as the ``sp`` query parameter).
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. Without a timeout a stalled connection
            would block the whole feature-extraction run indefinitely.

    Returns:
        The ``numSyllables`` value of the first result as an int, or 0 when
        the API returns no usable result.
    """
    base_url = 'https://api.datamuse.com/words'
    params = {
        'sp': word,
        'max': 1,
        'md': 'psf'
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    data = response.json()

    # Only the first (best) match is considered; anything else counts as "unknown".
    if data and isinstance(data, list):
        word_data = data[0]
        if 'word' in word_data and 'numSyllables' in word_data:
            return int(word_data['numSyllables'])

    return 0


# Function to process the DataFrame and print results
def process_dataframe(data_frame):
    """Add ``syllables``, ``vowels`` and ``characters`` columns to ``data_frame`` in place.

    The ``word`` column holds plain strings (possibly multi-word expressions).
    Each string is split on whitespace first, so that syllables are looked up
    per whole word rather than per character, and the character count covers
    the words themselves without the separating spaces.

    Args:
        data_frame: DataFrame with a ``word`` column of strings.

    Side effects:
        Mutates ``data_frame`` and prints one line per row followed by the
        whole frame.
    """
    # Iterating a string yields characters, so tokenise explicitly before summing.
    tokens = data_frame['word'].apply(lambda phrase: str(phrase).split())

    data_frame['syllables'] = tokens.apply(lambda words: sum(get_total_syllables(word) for word in words))
    data_frame['vowels'] = tokens.apply(lambda words: sum(1 for word in words for char in word if char.lower() in "aeiou"))
    data_frame['characters'] = tokens.apply(lambda words: sum(len(word) for word in words))

    # Print values for each word
    for index, row in data_frame.iterrows():
        print(f"Word: {row['word']}, Syllables: {row['syllables']}, Vowels: {row['vowels']}, Characters: {row['characters']}")

    # Print the updated DataFrame
    print("Updated DataFrame:")
    print(data_frame)



# Add the 'syllables', 'vowels' and 'characters' columns to data_frame in place.
# This goes through get_total_syllables, which issues Datamuse HTTP requests.
process_dataframe(data_frame)


In [None]:
%%time
# 3 New faster code for Simple Wiki data
# """
# In this updated version, the occurrence counts of unigrams, bigrams, trigrams, fourgrams, and fivegrams are calculated 
# separately and then combined into a single column called simple_wiki_freq. The counts are obtained by summing the 
# occurrence counts of each ngram type using the apply method with lambda functions.
# """
import pandas as pd
from collections import Counter
import nltk

# Function to count ngram occurrences in a file
def search_ngrams(file_path, ngrams):
    """Count every n-gram of a single order in a UTF-8 text file.

    The order is taken from the first element of ``ngrams``: a plain string
    means unigrams, any other sequence means ``len(first_element)``.
    (Taking ``len`` of a unigram string would give its number of characters,
    not an n-gram order.)

    Args:
        file_path: Path of the corpus file, read line by line.
        ngrams: Sample n-grams; only the first element is inspected, to
            determine the order. May be empty.

    Returns:
        A ``Counter`` over all n-grams of that order in the lower-cased,
        ``nltk.word_tokenize``-d corpus lines. Keys are plain token strings
        when ``ngrams`` holds strings, and token tuples otherwise. An empty
        ``Counter`` is returned for empty ``ngrams``.
    """
    ngram_occurrences = Counter()
    if len(ngrams) == 0:
        return ngram_occurrences

    first = ngrams[0]
    plain_words = isinstance(first, str)
    order = 1 if plain_words else len(first)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Convert to lowercase and split the line into words
            words = nltk.word_tokenize(line.lower())
            if plain_words:
                # String keys, so callers can look counts up by the word itself
                ngram_occurrences.update(words)
            else:
                ngram_occurrences.update(nltk.ngrams(words, order))

    return ngram_occurrences


# Tokenise every target phrase once: lower-cased, split on whitespace (one token list per row)
word_column_words = data_frame['word'].str.lower().str.split()


def _phrase_ngrams(order):
    """Return all n-grams of ``order`` found inside the individual target phrases, as tuples."""
    return [gram for tokens in word_column_words for gram in nltk.ngrams(tokens, order)]


# n-grams are built within each phrase, not across consecutive rows of the column
unigrams = [word for words in word_column_words for word in words]
bigrams = _phrase_ngrams(2)
trigrams = _phrase_ngrams(3)
fourgrams = _phrase_ngrams(4)
fivegrams = _phrase_ngrams(5)

# Count the occurrences of unigrams, bigrams, trigrams, fourgrams, and fivegrams in the Simple Wiki corpus
file_path = "corpus/simple_wiki.txt"


def _corpus_counts(grams):
    """Count corpus n-grams of the order of ``grams``; skip the corpus scan when there are none."""
    # search_ngrams reads the order from grams[0], so an empty list must not be passed on
    return search_ngrams(file_path, grams) if grams else Counter()


# Unigrams are passed as 1-tuples so their keys have the same tuple shape as the higher orders
unigram_occurrences = _corpus_counts([(word,) for word in unigrams])
bigram_occurrences = _corpus_counts(bigrams)
trigram_occurrences = _corpus_counts(trigrams)
fourgram_occurrences = _corpus_counts(fourgrams)
fivegram_occurrences = _corpus_counts(fivegrams)

_occurrences_by_order = {
    1: unigram_occurrences,
    2: bigram_occurrences,
    3: trigram_occurrences,
    4: fourgram_occurrences,
    5: fivegram_occurrences,
}


def _simple_wiki_freq(phrase):
    """Sum the corpus counts of every 1- to 5-gram contained in ``phrase``."""
    tokens = phrase.lower().split()
    return sum(
        counts[gram]
        for order, counts in _occurrences_by_order.items()
        for gram in nltk.ngrams(tokens, order)  # always tuples, matching the Counter keys
    )


# Combine the occurrence counts of all ngrams into a single column
data_frame['simple_wiki_freq'] = data_frame['word'].apply(_simple_wiki_freq)


In [None]:
%%time
#  4 New faster code to get HIT frequency with MWEs

def HIT_freq(data_frame, words, ID):
    """Count the rows of one HIT whose target word is one of the given words.

    Args:
        data_frame: DataFrame with ``ID`` and ``word`` columns.
        words: Target string; it is lower-cased and split on whitespace, so a
            multi-word expression contributes each of its words.
        ID: Value of the ``ID`` column selecting the rows to search.

    Returns:
        Number of rows with that ``ID`` whose lower-cased ``word`` equals any
        of the lower-cased words (0 when no row has that ``ID``).
    """
    words_lower = words.lower().split()
    paragraph_words = data_frame.loc[data_frame['ID'] == ID, 'word'].str.lower()
    return paragraph_words.isin(words_lower).sum()

# Corpus-wide counts of each lower-cased target; kept under its original name.
word_counts = data_frame['word'].str.lower().value_counts().to_dict()

# Testing membership in `word_counts` is true for every row (it is built from this
# very column), which made HIT_count equal to the number of rows per ID.
# Instead, count within each ID the rows whose target is one of this row's words,
# i.e. the same quantity HIT_freq computes, without re-filtering the frame per row.
_words_lower = data_frame['word'].str.lower()
_counts_per_hit = {
    hit_id: group.value_counts().to_dict()
    for hit_id, group in _words_lower.groupby(data_frame['ID'])
}
data_frame['HIT_count'] = [
    # set(): a word repeated inside one phrase must not be counted twice
    sum(_counts_per_hit.get(hit_id, {}).get(token, 0) for token in set(phrase.split()))
    for hit_id, phrase in zip(data_frame['ID'], _words_lower)
]


In [None]:
%%time
# 5 - New faster code to use multiple requests for Ngram data
import requests
from concurrent.futures import ThreadPoolExecutor

def get_ngram_counts(data_frame, timeout=30, verify=False):
    """Look up every ``word`` in the ngrams.dev API and store the match counts.

    Requests run concurrently on up to 10 threads that share one HTTP session.

    Args:
        data_frame: DataFrame with a ``word`` column; it is modified in place.
        timeout: Seconds to wait for each HTTP request, so that one stalled
            connection cannot block the run forever.
        verify: Passed to ``requests`` as the TLS ``verify`` flag.
            NOTE(review): the default stays ``False`` only to preserve the
            previous behaviour; this disables certificate checking. Pass
            ``verify=True`` unless there is a concrete reason not to.

    Returns:
        The same ``data_frame`` with ``absTotalMatchCount`` and
        ``relTotalMatchCount`` columns added (``None`` where the API returned
        no n-gram).
    """
    url = 'https://api.ngrams.dev/eng/search'

    def process_word(word):
        """Return ``(absTotalMatchCount, relTotalMatchCount)`` of the first hit, or ``(None, None)``."""
        params = {
            'query': word,
            'flags': 'cs',
            'limit': 1
        }
        response = session.get(url, params=params, verify=verify, timeout=timeout)
        data = response.json()

        if 'ngrams' in data and len(data['ngrams']) > 0:
            first_hit = data['ngrams'][0]
            return first_hit['absTotalMatchCount'], first_hit['relTotalMatchCount']
        return None, None

    words = data_frame['word']
    # ThreadPoolExecutor rejects max_workers=0, so keep one worker even for an empty frame
    num_workers = max(1, min(len(words), 10))

    # The session is closed once all requests are done; connections are reused meanwhile
    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            results = list(executor.map(process_word, words))

    data_frame['absTotalMatchCount'] = [abs_count for abs_count, _ in results]
    data_frame['relTotalMatchCount'] = [rel_count for _, rel_count in results]

    return data_frame
# Adds the 'absTotalMatchCount' and 'relTotalMatchCount' columns to data_frame in place
# (one ngrams.dev request per row); the function returns that same frame.
get_ngram_counts(data_frame)

In [None]:
%%time
# 6 Parse sentences
#Start core from command line

print("start core")

# Client object pointing at a CoreNLP server expected at this local URL
nlp = StanfordCoreNLP('http://localhost:9000')

# One row per distinct (sentence, ID) combination
sentences = data_frame[['sentence', 'ID']].copy().drop_duplicates()

print("end core")


In [None]:
%%time

# 7 

print("start parse sentence")

def parse(string):
    """Annotate ``string`` with the CoreNLP server and return the result as a dict.

    Requests the ``pos``, ``depparse`` and ``ner`` annotators in JSON format.

    Args:
        string: Text to annotate.

    Returns:
        The decoded JSON document. The client may hand back either raw JSON
        text or an already-decoded object depending on its version, so the
        text is decoded here only when needed (calling ``json.loads`` on a
        dict would raise ``TypeError``).

    Raises:
        json.JSONDecodeError: If the server answered with text that is not JSON.
    """
    output = nlp.annotate(string, properties={
        'annotators': 'pos,depparse,ner',
        'outputFormat': 'json'
    })
    if isinstance(output, (str, bytes)):
        return json.loads(output)
    return output

print("finish parsing sentence")

In [None]:
%%time
# 8 Function to parse a sentence and return the parsed output as a string
def parse_sentence(sentence):
    """Run ``parse`` on ``sentence`` and return its result serialised as a JSON string."""
    return json.dumps(parse(sentence))

# Rows can repeat the same sentence, so parse each distinct sentence once and
# map the stored JSON string back onto every row that contains it.
parsed_by_sentence = {
    sentence: parse_sentence(sentence)
    for sentence in data_frame['sentence'].unique()
}
data_frame['Parse'] = data_frame['sentence'].map(parsed_by_sentence)

# Print the updated dataframe
data_frame


In [None]:
%%time
# 9 - Get dependencies
import json

def get_dep(row):
    """Count the basic dependencies whose governor is the row's target word.

    Governors are lower-cased and stripped of the characters in the
    module-level ``remove`` string before comparison. The target word is
    lower-cased as well, so capitalised targets can match. All sentences of
    the parse are inspected, not only the first.

    Args:
        row: Mapping with ``word`` (target string) and ``Parse`` (JSON string
            of the parser output).

    Returns:
        Number of matching dependencies (0 when there are none).
    """
    target = row['word'].lower()
    parse = json.loads(row['Parse'])
    # Built once per call instead of once per dependency
    strip_table = {ord(char): None for char in remove}

    number = 0
    for sentence in parse['sentences']:
        for dependency in sentence['basicDependencies']:
            comp_word = dependency['governorGloss'].lower().translate(strip_table)
            if comp_word == target:
                number += 1

    return number
# Row-wise: store the number returned by get_dep for each row in a new 'dep' column.
data_frame['dep'] = data_frame.apply(get_dep, axis=1)

In [None]:
%%time
# 10 - Get Entity
def get_ner(row):
    """Return the NER label of the entity mention whose text equals the target word.

    The comparison is exact (case-sensitive). All sentences of the parse are
    searched, and a sentence without an ``entitymentions`` entry is skipped.

    Args:
        row: Mapping with ``word`` (target string) and ``Parse`` (JSON string
            of the parser output).

    Returns:
        The ``ner`` value of the first matching mention, or ``None`` when no
        mention matches.
    """
    word = row['word']
    parse = json.loads(row['Parse'])

    for sentence in parse['sentences']:
        for mention in sentence.get('entitymentions', []):
            if mention['text'] == word:
                return mention['ner']

    return None

# Row-wise: store the value returned by get_ner for each row in a new 'ner' column.
data_frame['ner'] = data_frame.apply(get_ner, axis=1)


In [None]:
%%time
# 11 - Get POS
def get_pos(row):
    """Return the POS tag of the first parser token that matches the target word.

    Tokens are lower-cased and stripped of the characters in the module-level
    ``remove`` string, then compared with the lower-cased target. All
    sentences of the parse are searched, in order.

    Args:
        row: Mapping with ``word`` (target string) and ``Parse`` (JSON string
            of the parser output).

    Returns:
        The ``pos`` value of the first matching token, or ``None`` when no
        single token matches.
    """
    target = row['word'].lower()
    parse_dict = json.loads(row['Parse'])
    # Built once per call instead of once per token
    strip_table = {ord(char): None for char in remove}

    for sentence in parse_dict['sentences']:
        for token in sentence['tokens']:
            if token['word'].lower().translate(strip_table) == target:
                return token['pos']

    return None
# Row-wise: store the value returned by get_pos for each row in a new 'pos' column.
data_frame['pos'] = data_frame.apply(get_pos, axis=1)

In [None]:
# 12 - Function to get the proper lemma

def get_lemma(row):
    """Return the lemma of the first parser token that matches the target word.

    Tokens are lower-cased and stripped of the characters in the module-level
    ``remove`` string, then compared with the lower-cased target. All
    sentences of the parse are searched, in order.

    Args:
        row: Mapping with ``word`` (target string) and ``Parse`` (JSON string
            of the parser output).

    Returns:
        The ``lemma`` value of the first matching token, or ``None`` when no
        single token matches.
    """
    target = row['word'].lower()
    parse_dict = json.loads(row['Parse'])
    # Built once per call instead of once per token
    strip_table = {ord(char): None for char in remove}

    for sentence in parse_dict['sentences']:
        for token in sentence['tokens']:
            if token['word'].lower().translate(strip_table) == target:
                return token['lemma']

    return None
# Row-wise: store the value returned by get_lemma for each row in a new 'lemma' column.
data_frame['lemma'] = data_frame.apply(get_lemma, axis=1)

In [None]:
# 13 - 

import os
import pickle

# Get the base filename (no directory, no extension) from the location variable
base_filename = os.path.splitext(os.path.basename(location))[0]

# Specify the output file path
output_dir = 'features_NEW'
output_file_path = os.path.join(output_dir, base_filename + '_NEW_Feats1.pkl')

# open(..., 'wb') does not create missing directories, so create the folder first
os.makedirs(output_dir, exist_ok=True)

# Pickle the data_frame DataFrame
with open(output_file_path, 'wb') as file:
    pickle.dump(data_frame, file)

In [None]:
# Write the current data_frame to 'output1.csv' in the working directory, without the index column.
data_frame.to_csv('output1.csv', index=False)

In [None]:

# Progress message only: this cell tags nothing itself, it just defines get_wordnet_pos.
print("start tagging")
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to a WordNet POS constant by its first letter.

    Tags starting with J, V, N or R map to the WordNet adjective, verb, noun
    and adverb constants respectively. Missing values, non-string values and
    any other tag give ``None``.
    """
    if pd.isna(treebank_tag):
        return None
    if not isinstance(treebank_tag, str):
        return None

    # Attribute names are resolved only for the matching letter
    wordnet_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = wordnet_attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

In [None]:
# Convert Treebank tags to ones that are compatible with Google


def is_noun(tag):
    """Return True when ``tag`` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True when ``tag`` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn_to_wn(tag):
    """Translate a Penn tag into the WordNet POS constant of its word class.

    The classes are tried in the order adjective, noun, adverb, verb; a tag
    that belongs to none of them gives ``None``.
    """
    # (predicate, wordnet attribute name); the attribute is looked up only on a match
    class_checks = (
        (is_adjective, 'ADJ'),
        (is_noun, 'NOUN'),
        (is_adverb, 'ADV'),
        (is_verb, 'VERB'),
    )
    for belongs_to_class, wordnet_attr in class_checks:
        if belongs_to_class(tag):
            return getattr(wordnet, wordnet_attr)
    return None

def penn_to_google(tag):
    """Translate a Penn tag into the short word-class label 'adj', 'n', 'adv' or 'v'.

    The classes are tried in the order adjective, noun, adverb, verb; a tag
    that belongs to none of them gives ``None``.
    """
    class_labels = (
        (is_adjective, 'adj'),
        (is_noun, 'n'),
        (is_adverb, 'adv'),
        (is_verb, 'v'),
    )
    for belongs_to_class, label in class_labels:
        if belongs_to_class(tag):
            return label
    return None

In [None]:
# to_csv does not create missing directories, so make sure the target folder exists first
os.makedirs("debugging", exist_ok=True)
data_frame.to_csv("debugging/word_parse_features_bug.csv", index=False)

In [None]:
import os
import pickle

# Get the base filename (no directory, no extension) from the location variable
base_filename = os.path.splitext(os.path.basename(location))[0]

# Specify the output file path
output_dir = 'features_NEW'
output_file_path = os.path.join(output_dir, base_filename + '_NEW_Feats.pkl')

# open(..., 'wb') does not create missing directories, so create the folder first
os.makedirs(output_dir, exist_ok=True)

# Pickle the data_frame DataFrame
with open(output_file_path, 'wb') as file:
    pickle.dump(data_frame, file)