In [2]:
#Populating word Features
import pandas as pd
import numpy
import string
import regex as re
import json
import os
import nltk
from nltk.corpus import wordnet
from datamuse import datamuse
import pycorenlp
from pycorenlp import StanfordCoreNLP
from nltk.stem import WordNetLemmatizer

Navigate to stanford-corenlp-4.5.4 folder and start core with “% java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 

Download from: https://stanfordnlp.github.io/CoreNLP/

In [8]:
# Initialize Datamuse API and StanfordCoreNLP
api = datamuse.Datamuse()
nlp = StanfordCoreNLP('http://localhost:9000')

# Set the paths for the input and output folders
folder_path = "cwishareddataset/testset/english/pickled-dataframes"
output_folder = "final_camb_feats_Test"

# Iterate over .pkl files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.pkl'):
        # Check if the filename contains "WikiNews"
        Wikinews = True if 'WikiNews' in filename else False

        # Construct the file path
        file_path = os.path.join(folder_path, filename)

        # Read the .pkl file into a DataFrame
        data_frame = pd.read_pickle(file_path)
        
       
        data_frame.columns = ['ID', 'sentence', 'start_index', 'end_index', 'phrase', 'total_native', 'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary', 'complex_probabilistic']


        # Perform data processing
        data_frame['split'] = data_frame['phrase'].apply(lambda x: x.split())
        data_frame['count'] = data_frame['split'].apply(lambda x: len(x))
        words = data_frame[data_frame['count'] == 1]
        MWEs = data_frame[data_frame['count'] >1]
        word_set = words.phrase.str.lower().unique()
        word_set = pd.DataFrame(word_set, columns=['phrase'])
        remove = string.punctuation.replace("-", "").replace("'", "") + '“”'
        pattern = r"[{}]".format(remove)
        word_set['phrase'] = word_set['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))

        
        
        
        #function to obtain syablles for words
        # from datamuse import datamuse
        # api = datamuse.Datamuse()

        def get_syllables(word):
            syllables = 0
            word_results = api.words(sp=word, max=1, md='psf')
            if len(word_results)>0: 
                word = word_results[0]["word"]
                syllables = int(word_results[0]["numSyllables"])
            return syllables

        #Apply function to get syllables
        word_set['syllables'] = word_set['phrase'].apply(lambda x: get_syllables(x))

        #Apply function to get word length 
        word_set['length'] = word_set['phrase'].apply(lambda x: len(x))

        #take words and merge with values first you will need to clean the phrase column 
        words['original phrase'] = words['phrase']
        words['phrase'] = words['phrase'].str.lower()
        words['phrase'] = words['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))

        word_features = pd.merge(words, word_set)
        
        #Now parse
        
        # nlp = StanfordCoreNLP('http://localhost:9000')

        sentences = data_frame[['sentence', 'ID']].copy()

        sentences = sentences.drop_duplicates()

        def removefirsttoken(x):
            x = x.split(' ', 1)[1]
            return x

        if Wikinews:
            sentences['clean sentence'] = sentences['sentence'].apply(lambda x: removefirsttoken(x))

        else:
            sentences['clean sentence'] = sentences['sentence']

        #function to parse sentences 
        def parse(string):
            output = nlp.annotate(string, properties={
          'annotators': 'pos,depparse',
          'outputFormat': 'json'
          })
            return output
        
        #apply parsing to sentences
        sentences['parse'] = sentences['clean sentence'].apply(lambda x: parse(x))

        sentences

        #Merge 
        word_parse_features = pd.merge(sentences, word_features)
        word_parse_features
        
        def get_pos(row):
            word = row['phrase']
            parse = json.loads(row['parse'])
            for i in range(len(parse['sentences'][0]['tokens'])):
                comp_word = parse['sentences'][0]['tokens'][i]['word']
                comp_word = comp_word.lower()
                comp_word = comp_word.translate({ord(char): None for char in remove})
                if comp_word == word:
                    return parse['sentences'][0]['tokens'][i]['pos']
        

        def get_dep(row):
            number = 0
            word = row['phrase']
            parse = json.loads(row['parse'])
            for i in range(len(parse['sentences'][0]['basicDependencies'])):
                comp_word = parse['sentences'][0]['basicDependencies'][i]['governorGloss']
                comp_word = comp_word.lower()
                comp_word = comp_word.translate({ord(char): None for char in remove})

                if comp_word == word:
                    number += 1

            return number

        #Function to get the proper lemma 
        

        def get_wordnet_pos(treebank_tag):
            from nltk.corpus import wordnet

            if treebank_tag.startswith('JJ'):
                return wordnet.ADJ
            elif treebank_tag.startswith('VB'):
                return wordnet.VERB
            elif treebank_tag.startswith('NN'):
                return wordnet.NOUN
            elif treebank_tag.startswith('RB'):
                return wordnet.ADV
            else:
                return None
            
        
        wordnet_lemmatizer = WordNetLemmatizer()
        def lemmatiser(row):

            word = row['phrase']
            pos = row['pos']

            try:
                lemma = wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(pos))
                return lemma
            except:
                try:
                    lemma = wordnet_lemmatizer.lemmatize(word)
                    return lemma
                except:
                    print(word)
                    
        #return MRC scores
        # mrc_features = pd.read_table('corpus/MRC.csv', names=('word', 'AOA', 'BFRQ', 'CNC', 'KFCAT', 'FAM', 'KFSMP', 'IMG', 'KFFRQ', 'NLET', 'CMEAN', 'PMEAN', 'NPHN', 'T-LFRQ'))
        mrc_features = pd.read_csv('corpus/MRC.csv', names=('id', 'NPHN', 'KFFRQ', 'KFCAT', 'KFSMP', 'T-LFRQ', 'FAM', 'CNC', 'IMG', 'AOA', 'word'), low_memory=False)





        def aoa(word):
            word = word.upper()  # Convert word to all capitals
            try:
                df = mrc_features.loc[mrc_features['word'] == word]
                fvalue = df.iloc[0]['AOA']
                return fvalue    
            except:
                return 0


        def CNC_fun(word):
            word = word.upper()
            table = mrc_features[mrc_features['word']==word]

            if len(table)>0:

                CNC = table['CNC'].values[0]
                CNC = int(CNC)

                return CNC
            else: 
                y=0
                return y

        def img(word):
            word = word.upper()
            try:
                df = mrc_features.loc[mrc_features['word'] == word]
                fvalue = df.iloc[0]['IMG']
                return fvalue    
            except:
                return 0







        def KFCAT_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    KFCAT = table['KFCAT'].values[0]
                    KFCAT = int(KFCAT)

                    return KFCAT
                else: 
                    y=0
                    return y

        def FAM_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    FAM = table['FAM'].values[0]
                    FAM = int(FAM)

                    return FAM
                else: 
                    y=0
                    return y

        def KFSMP_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    KFSMP = table['KFSMP'].values[0]
                    KFSMP = int(KFSMP)

                    return KFSMP
                else: 
                    y=0
                    return y

        def KFFRQ_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    KFFRQ = table['KFFRQ'].values[0]
                    KFFRQ = int(KFFRQ)

                    return KFFRQ
                else: 
                    y=0
                    return y

        # def NLET_fun(word):
        #         word = word.upper()
        #         table = mrc_features[mrc_features['word']==word]

        #         if len(table)>0:


        #             NLET = table['NLET'].values[0]
        #             NLET = int(NLET)

        #             return NLET
        #         else: 
        #             y=0
        #             return y

        def NPHN_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    NPHN = table['NPHN'].values[0]
                    NPHN = int(NPHN)

                    return NPHN
                else: 
                    y=0
                    return y

        def TLFRQ_fun(word):
                word = word.upper()
                table = mrc_features[mrc_features['word']==word]

                if len(table)>0:

                    TLFRQ = table['T-LFRQ'].values[0]
                    TLFRQ = int(TLFRQ)

                    return TLFRQ
                else: 
                    y=0
                    return y

        #functions using wordnet 
       
        def synonyms(word):
            synonyms=0
            try:
                results = wordnet.synsets(word)
                synonyms = len(results)
                return synonyms
            except:
                return synonyms

        def hypernyms(word):
            hypernyms=0
            try:
                results = wordnet.synsets(word)
                hypernyms = len(results[0].hypernyms())
                return hypernyms
            except:
                return hypernyms

        def hyponyms(word):
            hyponyms=0
            try:
                results = wordnet.synsets(word)
            except:
                return hyponyms
            try:
                hyponyms = len(results[0].hyponyms())
                return hyponyms
            except:
                return hyponyms

        #return CEFR levels
        # all_levels = pd.read_table('corpus/CALD.csv', names=('word', 'level'))

        # def levels(word):
        #     all_levels = pd.read_csv('corpus/cefrj-vocabulary-profile-1.5.csv')
        #     word = ''.join(word.split()).lower()
        #     df = all_levels.loc[all_levels['headword'] == word]
        #     if not df.empty:
        #         level = df.iloc[0]['CEFR']
        #         return level
        #     else:
        #         return 0

        def levels(word):
            word = ''.join(word.split()).lower()
            try:
                df = all_levels.loc[all_levels['word'] == word]
                level = df.iloc[0]['level']
                return level

            except:
                try:
                    df = all_levels.loc[all_levels['word'] == word]
                    level = df.iloc[0]['level']
                    return level
                except:
                    return 0
                
        #Convert tree bank tags to ones that are compatible w google 

        def is_noun(tag):
            return tag in ['NN', 'NNS', 'NNP', 'NNPS']


        def is_verb(tag):
            return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


        def is_adverb(tag):
            return tag in ['RB', 'RBR', 'RBS']


        def is_adjective(tag):
            return tag in ['JJ', 'JJR', 'JJS']


        def penn_to_wn(tag):
            if is_adjective(tag):
                return wn.ADJ
            elif is_noun(tag):
                return wn.NOUN
            elif is_adverb(tag):
                return wn.ADV
            elif is_verb(tag):
                return wn.VERB
            return None


        def penn_to_google(tag):
            if is_adjective(tag):
                return 'adj'
            elif is_noun(tag):
                return 'n'
            elif is_adverb(tag):
                return 'adv'
            elif is_verb(tag):
                return 'v'
            return None

        
        def get_frequency(row):
                nofreq = float(0.000000)
                word = row["phrase"]
                word = str(word)
                tag = row["pos"]
                tag = penn_to_google(tag)

                try:
                    word_results = api.words(sp=word, max=1, md='pf')
                    tag_list = (word_results[0]['tags'][:-1])

                    frequency = word_results[0]['tags'][-1][2:]

                    frequency = float(frequency)

                    if tag in tag_list :
                        return frequency
                    else:
                        lemma = row['lemma']
                        try:
                            word_results = api.words(sp=lemma, max=1, md='pf')
                            tag_list = (word_results[0]['tags'][:-1])

                            frequency = word_results[0]['tags'][-1][2:]

                            frequency = float(frequency)

                            if tag in tag_list:
                                return frequency
                            else:
                                return nofreq
                        except:
                            return nofreq

                except:


                    return nofreq 
                

        #GET DEP AND POS NUMBER
        word_parse_features['pos'] = word_parse_features.apply(get_pos, axis=1)
        word_parse_features['dep num'] = word_parse_features.apply(get_dep, axis=1)

        #To obtain word lemmas 
        #Get Lemma
        word_parse_features['lemma'] = word_parse_features.apply(lemmatiser, axis=1)

        #Apply function to get number of synonyms and hypernyms/hyponyms
        word_parse_features['synonyms'] = word_parse_features['lemma'].apply(lambda x: synonyms(x))
        word_parse_features['hypernyms'] = word_parse_features['lemma'].apply(lambda x: hypernyms(x))
        word_parse_features['hyponyms'] = word_parse_features['lemma'].apply(lambda x: hyponyms(x))

        #Apply function to check if contained in Ogden word set
        ogden = pd.read_table('binary-features/ogden.txt')
        word_parse_features['ogden'] = word_parse_features['lemma'].apply(lambda x : 1 if any(ogden.words == x) else 0) #clean words

        #Apply function to check if contained in simple wiki word set
        simple_wiki = pd.read_csv('binary-features/Most_Frequent.csv')
        word_parse_features['simple_wiki'] = word_parse_features['lemma'].apply(lambda x : 1 if any(simple_wiki.a == x) else 0) #clean words

        #Apply function to get the level from Cambridge Advanced Learner Dictionary
        # cald = pd.read_csv('binary-features/CALD.csv')
        # word_parse_features['cald'] = word_parse_features['phrase'].apply(lambda x : 1 if any(cald.a == x) else 0)
        # word_parse_features['cald'] = word_parse_features['phrase'].apply(lambda x: 1 if any(cald['Word'] == x) else 0)


        #Get some MRC features
        mrc_features = pd.read_csv('corpus/MRC.csv', names=('id','NPHN','KFFRQ','KFCAT','KFSMP','T-LFRQ','FAM','CNC','IMG','AOA', 'word'))    



        # word_parse_features['cnc'] = word_parse_features['lemma'].apply(lambda x: cnc(x))
        word_parse_features['CNC'] = word_parse_features['lemma'].apply(lambda x: CNC_fun(x) if x is not None else None)
        word_parse_features['IMG'] = word_parse_features['lemma'].apply(lambda x: img(x) if x is not None else None)


        #Apply function to check if contained  subimdb word set
        subimdb_500 = pd.read_csv('binary-features/subimbd_500.tsv', sep='\t')
        # subimdb_500 = pd.read_pickle('binary-features/subimbd_500.tsv')
        word_parse_features['sub_imdb'] = word_parse_features['lemma'].apply(lambda x : 1 if any(subimdb_500.words == x) else 0)

        #Apply function for google freq
        word_parse_features['google frequency'] = word_parse_features.apply(get_frequency ,axis=1)

        word_parse_features['phrase'] = word_parse_features.phrase.astype(str)
        word_parse_features['pos'] = word_parse_features.pos.astype(str)

        # word_parse_features['cnc'] = word_parse_features['lemma'].apply(lambda x: cnc(x))
        word_parse_features['CNC'] = word_parse_features['lemma'].apply(lambda x: CNC_fun(x))
        word_parse_features['IMG'] = word_parse_features['lemma'].apply(lambda x: img(x))

        word_parse_features['KFCAT']= word_parse_features['lemma'].apply(lambda x: KFCAT_fun(x))
        word_parse_features['FAM']= word_parse_features['lemma'].apply(lambda x: FAM_fun(x) )
        word_parse_features['KFSMP']= word_parse_features['lemma'].apply(lambda x: KFSMP_fun(x))
        word_parse_features['KFFRQ']= word_parse_features['lemma'].apply(lambda x: KFFRQ_fun(x))
        word_parse_features['AOA']= word_parse_features['lemma'].apply(lambda x: aoa(x))
        word_parse_features['NPHN']= word_parse_features['lemma'].apply(lambda x: NPHN_fun(x))
        word_parse_features['T-LFRQ']= word_parse_features['lemma'].apply(lambda x: TLFRQ_fun(x))
        
       
        
        
        # Combine single word dataframe and multiple words dataframe
        # combined_df = pd.concat([word_set, MWEs])

        # Sort combined dataframe by original dataframe's index
        # word_parse_features = combined_df.sort_index()
        
        # Combine word_parse_features and MWEs DataFrames NOT WORKING AS NEEDED
        # word_parse_features_with_MWEs = pd.concat([word_parse_features, MWEs])

        # Sort the combined DataFrame by the original order (index)
        # combined_df.sort_index(inplace=True)
        
        
        # Add back rows with multiple words in the "phrase" column in the original order
        word_parse_features = pd.concat([word_parse_features, MWEs]).sort_values(by='ID')
        
         
        # Fill any NaN values with zeros
        word_parse_features.fillna(0, inplace=True)
        
        # Save the processed DataFrame
        output_filename = os.path.splitext(filename)[0] + '_Final'
        output_file_path = os.path.join(output_folder, output_filename)
        word_parse_features.to_pickle(output_file_path)
        # word_parse_features_with_MWEs.to_pickle(output_file_path)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words['original phrase'] = words['phrase']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words['phrase'] = words['phrase'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words['phrase'] = words['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))
  mrc_features 

In [9]:
word_parse_features

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,IMG,sub_imdb,google frequency,KFCAT,FAM,KFSMP,KFFRQ,AOA,NPHN,T-LFRQ
0,"The teenage girl shot dead in Bellaghy, County...",30OITAWPBQ4V08AHXM3N85FC9DQ9HB,"The teenage girl shot dead in Bellaghy, County...","{\n ""sentences"": [\n {\n ""index"": 0,\...",47,58,londonderry,10,10,1,...,0,0.0,0.428829,0.0,0.0,0.0,0.0,0,0.0,0.0
55,I'd say they were close in life and in death c...,30OITAWPBQ4V08AHXM3N85FC9DQ9HB,I'd say they were close in life and in death c...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",39,44,death,10,10,0,...,498,1.0,163.536308,15.0,581.0,132.0,277.0,0,3.0,815.0
54,I'd say they were close in life and in death c...,30OITAWPBQ4V08AHXM3N85FC9DQ9HB,I'd say they were close in life and in death c...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",18,23,close,10,10,0,...,420,1.0,120.189275,15.0,587.0,166.0,234.0,283,0.0,1862.0
53,I'd say they were close in life and in death c...,30OITAWPBQ4V08AHXM3N85FC9DQ9HB,I'd say they were close in life and in death c...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",27,31,life,10,10,0,...,482,1.0,484.140126,15.0,598.0,279.0,715.0,0,3.0,4804.0
52,"Fr Dolan told BBC Radio Ulster: ""When Brenda w...",30OITAWPBQ4V08AHXM3N85FC9DQ9HB,"Fr Dolan told BBC Radio Ulster: ""When Brenda w...","{\n ""sentences"": [\n {\n ""index"": 0,\...",123,127,life,10,10,0,...,482,1.0,484.140126,15.0,598.0,279.0,715.0,0,3.0,4804.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,"Her older sister, aged 21, lived at the rented...",3ZXV7Q5FJBPDKAQEEGPE7A47ZJGFC0,"Her older sister, aged 21, lived at the rented...","{\n ""sentences"": [\n {\n ""index"": 0,\...",105,116,established,10,10,6,...,0,0.0,38.498944,9.0,0.0,43.0,58.0,0,8.0,327.0
513,"Her older sister, aged 21, lived at the rented...",3ZXV7Q5FJBPDKAQEEGPE7A47ZJGFC0,"Her older sister, aged 21, lived at the rented...","{\n ""sentences"": [\n {\n ""index"": 0,\...",68,73,built,10,10,0,...,399,1.0,73.589952,14.0,554.0,56.0,86.0,0,0.0,306.0
512,"Her older sister, aged 21, lived at the rented...",3ZXV7Q5FJBPDKAQEEGPE7A47ZJGFC0,"Her older sister, aged 21, lived at the rented...","{\n ""sentences"": [\n {\n ""index"": 0,\...",59,67,recently,10,10,0,...,0,1.0,49.414182,0.0,0.0,0.0,0.0,0,0.0,0.0
526,It is believed she has been taken to the Royal...,3ZXV7Q5FJBPDKAQEEGPE7A47ZJGFC0,It is believed she has been taken to the Royal...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",56,64,hospital,10,10,0,...,602,1.0,58.359238,12.0,548.0,48.0,110.0,319,0.0,825.0


In [None]:
# import pandas as pd

# data = pd.read_pickle('final_run/Wikipedia_Train_actual')
# data.to_csv('final_run/Wikipedia_Train_actual.csv', index=False)