"""
This pipeline first extracts features from the 'phrase' column using a CountVectorizer, which creates a "bag-of-words" representation of the text data. The 'absTotalMatchCount' and 'relTotalMatchCount' columns are scaled using a StandardScaler to ensure that these features have a mean of 0 and standard deviation of 1, which can help the AdaBoost algorithm converge faster.

Finally, these features are combined and passed to the AdaBoostClassifier. You can then train this pipeline on your training data and use it to predict whether phrases are complex or not.

This is just one possible pipeline. You might need to adjust it to fit your specific needs. In particular, you might need to select different transformations for the text data or use different methods for scaling or normalizing the numeric features.


"""

In [15]:
import pandas as pd

data_path = "cwishareddataset/traindevset/english/pickled-dataframes/Wikipedia_Dev.csv"
data_frame = pd.read_csv(data_path)

data_frame.head()  # Display the first few rows of the dataframe

Unnamed: 0,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual vertebrae towards the tip which resembled the feather-anchoring pygostyle of modern birds and some oviraptorosaurs .,4,8,tail,10,10.1,0,2,1,0.1
0,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,12,25,Epidexipteryx,10,10,6,3,1,0.45
1,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,31,35,bore,10,10,0,3,1,0.15
2,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,31,53,bore unusual vertebrae,10,10,0,1,1,0.05
3,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,36,53,unusual vertebrae,10,10,2,0,1,0.1
4,3QI9WAYOGQCX8YMZA9CAS9VCVMWS62,The tail of Epidexipteryx also bore unusual ve...,36,43,unusual,10,10,0,0,0,0.0


In [35]:
import os
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
N_gram_Model = "N_gram_Model"
# Function to train the classifier
def train_classifier(train_data, output_model_file):
    # Extract features and target variables
    train_features = train_data[['absTotalMatchCount', 'relTotalMatchCount']]
    train_targets = train_data.iloc[:, 9]  # Select the 10th column as the target variable

    # Build and train the classifier
    classifier = AdaBoostClassifier(n_estimators=5000, random_state=67)
    classifier.fit(train_features, train_targets)

    # Save the trained model
    joblib.dump(classifier, output_model_file)


train_classifier(N_gram_Model,"Desktop/CWI_masters/camb_model/cwi_2018-master/Ngram_data/predictions")

TypeError: string indices must be integers

In [36]:
import os
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Function to train the classifier
def train_classifier(train_data, output_model_file):
    # Extract features and target variables
    train_features = train_data[['absTotalMatchCount', 'relTotalMatchCount']]
    train_targets = train_data.iloc[:, 9]  # Select the 10th column as the target variable

    # Build and train the classifier
    classifier = AdaBoostClassifier(n_estimators=5000, random_state=67)
    classifier.fit(train_features, train_targets)

    # Save the trained model
    joblib.dump(classifier, output_model_file)

# Define the file paths for training data and output model file
train_data_file = "Ngram_data/lexicon_ngram_train.tsv"
output_model_file = "Desktop/CWI_masters/camb_model/cwi_2018-master/Ngram_data/predictions"

# Load the training data
train_data = pd.read_csv(train_data_file, sep='\t')

# Train the classifier and save the model
train_classifier(train_data, output_model_file)


FileNotFoundError: [Errno 2] No such file or directory: 'Desktop/CWI_masters/camb_model/cwi_2018-master/Ngram_data/predictions'

In [None]:

# Function to run the classifier and evaluate it
def evaluate_classifier(test_data, model_file):
    # Load the trained model
    classifier = joblib.load(model_file)

    # Extract features and target variables
    test_features = test_data[['absTotalMatchCount', 'relTotalMatchCount']]
    test_targets = test_data.iloc[:, 9]  # Select the 10th column as the target variable

    # Convert 'phrase' column values to string
    test_data['phrase'] = test_data['phrase'].astype(str)

    # Print the values of the 4th column (phrase) before evaluating
    print("Values of the 4th column (phrase):")
    print(test_data['phrase'])
    print()

    # Make predictions on the test set
    test_predictions = classifier.predict(test_features)

    # Print the evaluated strings and their labels
    print("Evaluated strings:")
    for index, row in test_data.iterrows():
        phrase = row['phrase']
        predicted_label = test_predictions[index]
        real_label = test_targets[index]
        if ' ' in phrase:
            print("Phrase:", phrase)
            print("Predicted Label:", predicted_label)
            print("Real Label:", real_label)
            print()

    # Evaluate the performance of the classifier
    accuracy = accuracy_score(test_targets, test_predictions)
    precision = precision_score(test_targets, test_predictions)
    recall = recall_score(test_targets, test_predictions)
    f1 = f1_score(test_targets, test_predictions)

    # Print the evaluation metrics
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)





# Load the lexicon data
train_data = pd.read_csv('Ngram_data/lexicon_ngram_train.tsv', sep='\t')
test_data = pd.read_csv('Ngram_data/lexicon_ngram_test.tsv', sep='\t')

# Check if the fourth column contains multiple words
if train_data.iloc[:, 3].str.contains(' ').any() and test_data.iloc[:, 3].str.contains(' ').any():
    # Train the classifier and save the model
    train_classifier(train_data, 'basic_NgramModel')

    # Run the classifier and evaluate it
    evaluate_classifier(test_data, 'basic_NgramModel')


In [20]:
import os
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor

# Function to get ngram counts
def get_ngram_counts(data_frame):
    abs_counts = []
    rel_counts = []

    session = requests.Session()  # Create a session object for reusing connections

    def process_phrase(phrase):
        url = 'https://api.ngrams.dev/eng/search'
        params = {
            'query': phrase,
            'flags': 'cs',
            'limit': 1
        }

        response = session.get(url, params=params, verify=True)  # Reuse the session for subsequent requests
        data = response.json()

        if 'ngrams' in data and len(data['ngrams']) > 0:
            abs_count = data['ngrams'][0]['absTotalMatchCount']
            rel_count = data['ngrams'][0]['relTotalMatchCount']
        else:
            abs_count = None
            rel_count = None

        return abs_count, rel_count

    phrases = data_frame.iloc[:, 3]
    num_workers = min(len(phrases), 10)  # Adjust the number of workers as per your requirements
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(process_phrase, phrases)

    for abs_count, rel_count in results:
        abs_counts.append(abs_count)
        rel_counts.append(rel_count)

    data_frame['absTotalMatchCount'] = abs_counts
    data_frame['relTotalMatchCount'] = rel_counts

    return data_frame

data_directory = "cwishareddataset/traindevset/english/pickled-dataframes"
output_directory = "Ngram_data"  # Specify the directory where you want to save the lexicon files

for file in os.listdir(data_directory):
    if file.endswith("_Dev.pkl"):
        data_frame = pd.read_pickle(os.path.join(data_directory, file))
        processed_data_frame = get_ngram_counts(data_frame)
        output_file = os.path.join(output_directory, "lexicon_ngram_test.tsv")
        processed_data_frame.to_csv(output_file, sep='\t', index=False)
    elif file.endswith("_Train.pkl"):
        data_frame = pd.read_pickle(os.path.join(data_directory, file))
        processed_data_frame = get_ngram_counts(data_frame)
        output_file = os.path.join(output_directory, "lexicon_ngram_train.tsv")
        processed_data_frame.to_csv(output_file, sep='\t', index=False)

data_frame.head()  # Display the first few rows of the dataframe


Unnamed: 0,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered victims, but the status of one has been changed to suspect.",6,15,Initially,10,10.1,7,1,1.1,0.4,absTotalMatchCount,relTotalMatchCount
0,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered vic...",32,42,considered,10,10,2,3,1,0.25,109512829,5.5e-05
1,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered vic...",43,50,victims,10,10,4,2,1,0.3,305606548,0.000153
2,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered vic...",60,66,status,10,10,3,1,1,0.2,63091031,3.2e-05
3,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered vic...",83,90,changed,10,10,0,2,1,0.1,122923946,6.2e-05
4,374UMBUHN5QN3F8F90U3OEJ8SKCTCW,"#26-7 Initially, all three were considered vic...",94,101,suspect,10,10,3,2,1,0.25,54165465,2.7e-05


In [18]:
data_frame.to_csv('lexicon_Ngram.tsv', sep='\t', index=False)

In [12]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

# Define a feature extraction pipeline for the phrase
phrase_features = Pipeline([
    ('selector', TextSelector(key='phrase')),
    ('vect', CountVectorizer())
])

# Define feature scaling pipelines for the ngram counts
abs_counts_scaler = Pipeline([
    ('selector', NumberSelector(key='absTotalMatchCount')),
    ('standard', StandardScaler())
])

rel_counts_scaler = Pipeline([
    ('selector', NumberSelector(key='relTotalMatchCount')),
    ('standard', StandardScaler())
])

# Combine these into a feature union
feats = FeatureUnion([
    ('phrase', phrase_features),
    ('abs_counts', abs_counts_scaler),
    ('rel_counts', rel_counts_scaler),
])

# Define the final pipeline as a combination of feature extraction and AdaBoost
pipeline = Pipeline([
    ('features', feats),
    ('classifier', AdaBoostClassifier(n_estimators=5000, random_state=67)),
])




# Read all .pkl files and split into training and test sets
data_path = "cwishareddataset/traindevset/english/pickled-dataframes"
files = os.listdir(data_path)

train_data = pd.concat([pd.read_pickle(os.path.join(data_path, file)) for file in files if file.endswith('_Train.pkl')])
test_data = pd.concat([pd.read_pickle(os.path.join(data_path, file)) for file in files if file.endswith('_Dev.pkl')])

# Rename the columns
column_names = ['ID', 'sentence', 'start_index', 'end_index', 'phrase', 'total_native', 'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary', 'complex_probabilistic']
train_data.columns = column_names
test_data.columns = column_names
# Extract the target variables from the 10th column of the data
train_targets = train_data.iloc[:, 9].values
test_targets = test_data.iloc[:, 9].values


    
# Extract the phrase variables from the 4th column of the data
train_data['phrase'] = train_data.iloc[:, 4]
test_data['phrase'] = test_data.iloc[:, 4]

# Check the type of each element in the 'phrase' column
non_string_data = train_data[train_data['phrase'].apply(lambda x: not isinstance(x, str))]

print(f"Number of non-string entries: {len(non_string_data)}")

if len(non_string_data) > 0:
    print("Some examples of non-string entries:")
    print(non_string_data.head())
# Use FeatureUnion and pipelines to process features and fit the model
pipeline = Pipeline([
    ('features', feats),
    ('classifier', AdaBoostClassifier(n_estimators=5000, random_state=67)),
])

pipeline.fit(train_data, train_targets)

# Apply the classifier pipeline to the data
test_data['complex_prediction'] = pipeline.predict(test_data)

# Calculate accuracy, precision, recall, and f1-score
accuracy = accuracy_score(test_targets, test_data['complex_prediction'])
precision = precision_score(test_targets, test_data['complex_prediction'])
recall = recall_score(test_targets, test_data['complex_prediction'])
f1 = f1_score(test_targets, test_data['complex_prediction'])

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)


ValueError: Length mismatch: Expected axis has 23 elements, new values have 11 elements

In [None]:
def classify_and_evaluate(df):
    # Apply the classifier pipeline to the data
    df['complex_prediction'] = pipeline.predict(df)
    
    # Calculate accuracy, precision, recall, and f1-score
    evaluation_data = []
    for phrase in df['phrase'].unique():
        phrase_data = df[df['phrase'] == phrase]
        accuracy = accuracy_score(phrase_data['complex_binary'], phrase_data['complex_prediction'])
        precision = precision_score(phrase_data['complex_binary'], phrase_data['complex_prediction'])
        recall = recall_score(phrase_data['complex_binary'], phrase_data['complex_prediction'])
        f1 = f1_score(phrase_data['complex_binary'], phrase_data['complex_prediction'])
        evaluation_data.append([phrase, accuracy, precision, recall, f1])
    
    # Convert the evaluation data into a DataFrame
    evaluation_df = pd.DataFrame(evaluation_data, columns=['phrase', 'accuracy', 'precision', 'recall', 'f1_score'])
    
    return evaluation_df


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

# Define a feature extraction pipeline for the phrase
phrase_features = Pipeline([
    ('selector', TextSelector(key='phrase')),
    ('vect', CountVectorizer())
])

# Define feature scaling pipelines for the ngram counts
abs_counts_scaler = Pipeline([
    ('selector', NumberSelector(key='absTotalMatchCount')),
    ('standard', StandardScaler())
])

rel_counts_scaler = Pipeline([
    ('selector', NumberSelector(key='relTotalMatchCount')),
    ('standard', StandardScaler())
])

# Combine these into a feature union
feats = FeatureUnion([
    ('phrase', phrase_features),
    ('abs_counts', abs_counts_scaler),
    ('rel_counts', rel_counts_scaler),
])

# Define the final pipeline as a combination of feature extraction and AdaBoost
pipeline = Pipeline([
    ('features', feats),
    ('classifier', AdaBoostClassifier(n_estimators=5000, random_state=67)),
])

# Read all .pkl files and split into training and test sets
data_path = "cwishareddataset/traindevset/english/pickled-dataframes"
files = os.listdir(data_path)

train_data = pd.concat([pd.read_pickle(os.path.join(data_path, file)) for file in files if file.endswith('_Train.pkl')])
test_data = pd.concat([pd.read_pickle(os.path.join(data_path, file)) for file in files if file.endswith('_Dev.pkl')])

# Rename the columns
column_names = ['ID', 'sentence', 'start_index', 'end_index', 'phrase', 'total_native', 'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary', 'complex_probabilistic']
train_data.columns = column_names
test_data.columns = column_names

# Extract the target variables from the 10th column of the data
train_targets = train_data.iloc[:, 9].values
test_targets = test_data.iloc[:, 9].values

# Extract the phrase variables from the 4th column of the data
train_data['phrase'] = train_data.iloc[:, 4]
test_data['phrase'] = test_data.iloc[:, 4]

# Check the type of each element in the 'phrase' column
non_string_data = train_data[train_data['phrase'].apply(lambda x: not isinstance(x, str))]

print(f"Number of non-string entries: {len(non_string_data)}")

if len(non_string_data) > 0:
    print("Some examples of non-string entries:")
    print(non_string_data.head())

# Use FeatureUnion and pipelines to process features and fit the model
pipeline = Pipeline([
    ('features', feats),
    ('classifier', AdaBoostClassifier(n_estimators=5000, random_state=67)),
])

pipeline.fit(train_data, train_targets)

# Apply the classifier pipeline to the data
test_data['complex_prediction'] = pipeline.predict(test_data)

# Calculate accuracy, precision, recall, and f1-score
accuracy = accuracy_score(test_targets, test_data['complex_prediction'])
precision = precision_score(test_targets, test_data['complex_prediction'])
recall = recall_score(test_targets, test_data['complex_prediction'])
f1 = f1_score(test_targets, test_data['complex_prediction'])

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)


In [64]:
###Code for returning word feature for MWEs

import pandas as pd

# Load the pickled DataFrame from the .pkl file
data_frame = pd.read_pickle('final_camb_feats/WikiNews_Train_actual')

# Save the DataFrame as a .csv file
data_frame.to_csv('final_camb_feats/inspect.csv', index=False)

In [65]:
#Populating word Features
import pandas as pd
import numpy
import string
import regex as re
import json
import os

# Read the .pkl file into a DataFrame
file_path = 'cwishareddataset/traindevset/english/pickled-dataframes/News_Dev.pkl'
data_frame = pd.read_pickle(file_path)


data_frame.columns = ['ID', 'sentence', 'start_index', 'end_index', 'phrase', 'total_native', 'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary', 'complex_probabilistic']


# Perform data processing
data_frame['split'] = data_frame['phrase'].apply(lambda x: x.split())
data_frame['count'] = data_frame['split'].apply(lambda x: len(x))
words = data_frame[data_frame['count'] == 1]
MWEs = data_frame[data_frame['count'] >1]
word_set = words.phrase.str.lower().unique()
word_set = pd.DataFrame(word_set, columns=['phrase'])
remove = string.punctuation.replace("-", "").replace("'", "") + '“”'
pattern = r"[{}]".format(remove)
word_set['phrase'] = word_set['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))

In [85]:


#function to obtain syablles for words
from datamuse import datamuse
api = datamuse.Datamuse()

def get_syllables(word):
    syllables = 0
    word_results = api.words(sp=word, max=1, md='psf')
    if len(word_results)>0: 
        word = word_results[0]["word"]
        syllables = int(word_results[0]["numSyllables"])
    return syllables

# #Apply function to get syllables
# word_set['syllables'] = word_set['phrase'].apply(lambda x: get_syllables(x))

# #Apply function to get word length 
# word_set['length'] = word_set['phrase'].apply(lambda x: len(x))

# #take words and merge with values first you will need to clean the phrase column 
# words['original phrase'] = words['phrase']
# words['phrase'] = words['phrase'].str.lower()
# words['phrase'] = words['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))

word_features = pd.merge(words, word_set)

#Now parse
import pycorenlp
import pandas as pd
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

sentences = data_frame[['sentence', 'ID']].copy()

sentences = sentences.drop_duplicates()

def removefirsttoken(x):
    x = x.split(' ', 1)[1]
    return x

# if Wikinews:
#     sentences['clean sentence'] = sentences['sentence'].apply(lambda x: removefirsttoken(x))


sentences['clean sentence'] = sentences['sentence']

#function to parse sentences 
def parse(string):
    output = nlp.annotate(string, properties={
  'annotators': 'pos,depparse',
  'outputFormat': 'json'
  })
    return output

#apply parsing to sentences
sentences['parse'] = sentences['clean sentence'].apply(lambda x: parse(x))

sentences

#Merge 
word_parse_features = pd.merge(sentences, word_features)
word_parse_features

def get_pos(row):
    word = row['phrase']
    parse = json.loads(row['parse'])
    for i in range(len(parse['sentences'][0]['tokens'])):
        comp_word = parse['sentences'][0]['tokens'][i]['word']
        comp_word = comp_word.lower()
        comp_word = comp_word.translate({ord(char): None for char in remove})
        if comp_word == word:
            return parse['sentences'][0]['tokens'][i]['pos']


def get_dep(row):
    number = 0
    word = row['phrase']
    parse = json.loads(row['parse'])
    for i in range(len(parse['sentences'][0]['basicDependencies'])):
        comp_word = parse['sentences'][0]['basicDependencies'][i]['governorGloss']
        comp_word = comp_word.lower()
        comp_word = comp_word.translate({ord(char): None for char in remove})

        if comp_word == word:
            number += 1

    return number

#Function to get the proper lemma 
import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    from nltk.corpus import wordnet

    if treebank_tag.startswith('JJ'):
        return wordnet.ADJ
    elif treebank_tag.startswith('VB'):
        return wordnet.VERB
    elif treebank_tag.startswith('NN'):
        return wordnet.NOUN
    elif treebank_tag.startswith('RB'):
        return wordnet.ADV
    else:
        return None

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatiser(row):

    word = row['phrase']
    pos = row['pos']

    try:
        lemma = wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        return lemma
    except:
        try:
            lemma = wordnet_lemmatizer.lemmatize(word)
            return lemma
        except:
            print(word)




mrc_features = pd.read_csv('corpus/MRC.csv', names=('id', 'NPHN', 'KFFRQ', 'KFCAT', 'KFSMP', 'T-LFRQ', 'FAM', 'CNC', 'IMG', 'AOA', 'word'), low_memory=False)





def aoa(word):
    word = word.upper()  # Convert word to all capitals
    try:
        df = mrc_features.loc[mrc_features['word'] == word]
        fvalue = df.iloc[0]['AOA']
        return fvalue    
    except:
        return 0


def CNC_fun(word):
    word = word.upper()
    table = mrc_features[mrc_features['word']==word]

    if len(table)>0:

        CNC = table['CNC'].values[0]
        CNC = int(CNC)

        return CNC
    else: 
        y=0
        return y

def img(word):
    word = word.upper()
    try:
        df = mrc_features.loc[mrc_features['word'] == word]
        fvalue = df.iloc[0]['IMG']
        return fvalue    
    except:
        return 0







def KFCAT_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            KFCAT = table['KFCAT'].values[0]
            KFCAT = int(KFCAT)

            return KFCAT
        else: 
            y=0
            return y

def FAM_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            FAM = table['FAM'].values[0]
            FAM = int(FAM)

            return FAM
        else: 
            y=0
            return y

def KFSMP_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            KFSMP = table['KFSMP'].values[0]
            KFSMP = int(KFSMP)

            return KFSMP
        else: 
            y=0
            return y

def KFFRQ_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            KFFRQ = table['KFFRQ'].values[0]
            KFFRQ = int(KFFRQ)

            return KFFRQ
        else: 
            y=0
            return y

# def NLET_fun(word):
#         word = word.upper()
#         table = mrc_features[mrc_features['word']==word]

#         if len(table)>0:


#             NLET = table['NLET'].values[0]
#             NLET = int(NLET)

#             return NLET
#         else: 
#             y=0
#             return y

def NPHN_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            NPHN = table['NPHN'].values[0]
            NPHN = int(NPHN)

            return NPHN
        else: 
            y=0
            return y

def TLFRQ_fun(word):
        word = word.upper()
        table = mrc_features[mrc_features['word']==word]

        if len(table)>0:

            TLFRQ = table['T-LFRQ'].values[0]
            TLFRQ = int(TLFRQ)

            return TLFRQ
        else: 
            y=0
            return y


In [76]:
2#

# Create an empty dictionary from the words in the 'split' column of MWEs dataframe
word_dict = {word: None for sublist in MWEs['split'] for word in sublist}
def create_dict(row):
    return {word: None for word in row}

# Create a copy of the MWEs dataframe to avoid SettingWithCopyWarning
MWEs_copy = MWEs.copy()

MWEs_copy['feat_dict'] = MWEs_copy['split'].apply(create_dict)


def create_dict(row):
    word_dict = {word: None for word in row['split']}
    
    return word_dict

# Apply the function to each row in the MWEs dataframe
MWEs_copy = MWEs.copy()
MWEs_copy['feat_dict'] = MWEs_copy.apply(create_dict, axis=1)

def update_dict(row):
    for word in row['split']:
        word_dict = row['feat_dict']
        word_dict[word] = {
            'AOA': aoa(word),
            'CNC': CNC_fun(word),
            'IMG': img(word),
            'KFCAT': KFCAT_fun(word),
            'FAM': FAM_fun(word),
            'KFSMP': KFSMP_fun(word),
            'KFFRQ': KFFRQ_fun(word),
            'NPHN': NPHN_fun(word),
            'TLFRQ': TLFRQ_fun(word)
        }
    return word_dict

MWEs_copy['feat_dict'] = MWEs_copy.apply(update_dict, axis=1)

In [80]:
for i in range(3):
    print(MWEs_copy['feat_dict'].iloc[i])
    print()

{'sparking': {'AOA': '0', 'CNC': 0, 'IMG': '0', 'KFCAT': 0, 'FAM': 0, 'KFSMP': 0, 'KFFRQ': 0, 'NPHN': 0, 'TLFRQ': 0}, 'intense': {'AOA': '0', 'CNC': 361, 'IMG': '340', 'KFCAT': 10, 'FAM': 490, 'KFSMP': 37, 'KFFRQ': 40, 'NPHN': 0, 'TLFRQ': 94}}

{'sparking': {'AOA': '0', 'CNC': 0, 'IMG': '0', 'KFCAT': 0, 'FAM': 0, 'KFSMP': 0, 'KFFRQ': 0, 'NPHN': 0, 'TLFRQ': 0}, 'intense': {'AOA': '0', 'CNC': 361, 'IMG': '340', 'KFCAT': 10, 'FAM': 490, 'KFSMP': 37, 'KFFRQ': 40, 'NPHN': 0, 'TLFRQ': 94}, 'clashes': {'AOA': '0', 'CNC': 0, 'IMG': '0', 'KFCAT': 0, 'FAM': 0, 'KFSMP': 0, 'KFFRQ': 0, 'NPHN': 0, 'TLFRQ': 0}}

{'intense': {'AOA': '0', 'CNC': 361, 'IMG': '340', 'KFCAT': 10, 'FAM': 490, 'KFSMP': 37, 'KFFRQ': 40, 'NPHN': 0, 'TLFRQ': 94}, 'clashes': {'AOA': '0', 'CNC': 0, 'IMG': '0', 'KFCAT': 0, 'FAM': 0, 'KFSMP': 0, 'KFFRQ': 0, 'NPHN': 0, 'TLFRQ': 0}}



In [88]:
MWEs

Unnamed: 0,ID,sentence,start_index,end_index,phrase,total_native,total_non_native,native_complex,non_native_complex,complex_binary,complex_probabilistic,split,count,feat_dict
6,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,67,sparking intense,10,10,0,1,1,0.05,"[sparking, intense]",2,"{'sparking': None, 'intense': None}"
7,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,75,sparking intense clashes,10,10,1,1,1,0.10,"[sparking, intense, clashes]",3,"{'sparking': None, 'intense': None, 'clashes':..."
9,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,60,75,intense clashes,10,10,0,2,1,0.10,"[intense, clashes]",2,"{'intense': None, 'clashes': None}"
12,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,86,102,bloodied victims,10,10,0,1,1,0.05,"[bloodied, victims]",2,"{'bloodied': None, 'victims': None}"
28,3Z8UJEJOCZEG603II1EL4BE2PV593A,"The violence in Rastan, in the restive central...",109,133,internationally brokered,10,10,1,0,1,0.05,"[internationally, brokered]",2,"{'internationally': None, 'brokered': None}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,89,111,asset management firms,10,10,4,0,1,0.20,"[asset, management, firms]",3,"{'asset': None, 'management': None, 'firms': N..."
1733,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,118,127,fire sale,10,10,3,1,1,0.20,"[fire, sale]",2,"{'fire': None, 'sale': None}"
1736,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,146,161,Luis de Guindos,10,10,0,0,0,0.00,"[Luis, de, Guindos]",3,"{'Luis': None, 'de': None, 'Guindos': None}"
1752,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,83,95,bank rescues,10,10,0,0,0,0.00,"[bank, rescues]",2,"{'bank': None, 'rescues': None}"


In [None]:


# Add the necessary imports at the top of your script
import json
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from pycorenlp import StanfordCoreNLP

# Initiate the CoreNLP server connection
nlp = StanfordCoreNLP('http://localhost:9000')


# Define a function to update the dictionary with these new features
def update_dict_with_parsed_data(row):
    word_dict = row['feat_dict']
    parse_data = row['parse']  # Use the 'parse' column from the DataFrame
    
    # If the parse data is a string in JSON format, convert it to a dictionary
    if isinstance(parse_data, str):
        parse_data = json.loads(parse_data)
    
    for word in row['split']:
        # Get the existing word features
        word_features = word_dict[word]
        
        # Update the dictionary with the additional features
        word_features.update({
            'AOA': aoa(word),
            'CNC': CNC_fun(word),
            'IMG': img(word),
            'KFCAT': KFCAT_fun(word),
            'FAM': FAM_fun(word),
            'KFSMP': KFSMP_fun(word),
            'KFFRQ': KFFRQ_fun(word),
            'NPHN': NPHN_fun(word),
            'TLFRQ': TLFRQ_fun(word),
            'parse': parse_data,  # This is now the parse data from the DataFrame
            'pos': get_pos(row),
            'dep': get_dep(row),
            'lemma': lemmatiser(row)
        })
        
    return word_dict

# Apply this function to each row in the DataFrame
MWEs_copy['feat_dict'] = MWEs_copy.apply(update_dict_with_parsed_data, axis=1)




In [56]:
import pandas as pd
import numpy as np
import string
import regex as re
import json
import os

# Read the .pkl file into a DataFrame
file_path = 'cwishareddataset/traindevset/english/pickled-dataframes/News_Dev.pkl'
data_frame = pd.read_pickle(file_path)

data_frame.columns = ['ID', 'sentence', 'start_index', 'end_index', 'phrase', 'total_native', 'total_non_native', 'native_complex', 'non_native_complex', 'complex_binary', 'complex_probabilistic']

# Perform data processing
data_frame['split'] = data_frame['phrase'].apply(lambda x: x.split())
data_frame['count'] = data_frame['split'].apply(lambda x: len(x))

words = data_frame[data_frame['count'] == 1]
MWEs = data_frame[data_frame['count'] > 1]

word_set = words.phrase.str.lower().unique()
word_set = pd.DataFrame(word_set, columns=['phrase'])

remove = string.punctuation.replace("-", "").replace("'", "") + '“”'
pattern = r"[{}]".format(remove)
word_set['phrase'] = word_set['phrase'].apply(lambda x: x.translate({ord(char): None for char in remove}))

# Create an empty dictionary for each word in MWEs DataFrame
import pandas as pd



word_dicts = []

for _, row in MWEs.iterrows():
    mwe = row['split']
    word_dict = {}
    for word in mwe.split():
        # Create an empty dictionary for the word
        word_dict[word] = {}
    
    word_dicts.append(word_dict)

keys = [
    "parse",
    "total_non_native",
    "native_complex",
    "IMG",
    "sub_imdb",
    "google frequency",
    "KFCAT",
    "FAM",
    "KFSMP",
    "KFFRQ",
    "AOA",
    "NPHN",
    "T-LFRQ"
]

result = []

for word_dict in word_dicts:
    word_result = {}
    for word in word_dict:
        word_dict[word] = {key: "" for key in keys}
        word_result[word] = word_dict[word]
    result.append(word_result)

print(result)

    


    

AttributeError: 'list' object has no attribute 'split'

In [61]:
import pandas as pd

# Assuming you have a DataFrame named MWEs with a column named "split"

keys = [
    "parse",
    "total_non_native",
    "native_complex",
    "IMG",
    "sub_imdb",
    "google frequency",
    "KFCAT",
    "FAM",
    "KFSMP",
    "KFFRQ",
    "AOA",
    "NPHN",
    "T-LFRQ"
]

word_dicts = []

for _, row in MWEs.iterrows():
    if row['split']
        mwe = row['split']
        for word in mwe.split():
            word_dict = {key: "" for key in keys}
            word_dicts.append({word: word_dict})

print(word_dicts)


SyntaxError: invalid syntax (611975966.py, line 24)

In [24]:
features = ['parse', 'phrase', 'total_native', 'total_non_native',
            'native_complex', 'IMG', 'sub_imdb', 'google frequency', 'KFCAT', 'FAM', 'KFSMP', 'KFFRQ',
            'AOA', 'NPHN', 'T-LFRQ']

for word in word_dicts:
    word_dicts[word] = {feature: None for feature in features}
# Update the "word_dict" column of the dataframe
MWEs['word_dict'] = MWEs['word_dict'].apply(lambda x: word_dict if x == word_dict else x)

TypeError: list indices must be integers or slices, not dict

In [29]:
import pandas as pd

# Assuming MWEs is the dataframe with a column called "word_dict"
# And word_dict is the modified dictionary
features = ['parse', 'phrase', 'total_native', 'total_non_native',
            'native_complex', 'IMG', 'sub_imdb', 'google frequency', 'KFCAT', 'FAM', 'KFSMP', 'KFFRQ',
            'AOA', 'NPHN', 'T-LFRQ']

for word in word_dicts:
    word_dicts[word] = {feature: None for feature in features}


TypeError: list indices must be integers or slices, not dict

In [26]:
MWEs

Unnamed: 0,ID,sentence,start_index,end_index,phrase,total_native,total_non_native,native_complex,non_native_complex,complex_binary,complex_probabilistic,split,count,word_dict
6,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,67,sparking intense,10,10,0,1,1,0.05,"[sparking, intense]",2,"{'sparking': {}, 'intense': {}, 'parse': None,..."
7,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,51,75,sparking intense clashes,10,10,1,1,1,0.10,"[sparking, intense, clashes]",3,"{'sparking': {}, 'intense': {}, 'clashes': {},..."
9,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,60,75,intense clashes,10,10,0,2,1,0.10,"[intense, clashes]",2,"{'intense': {}, 'clashes': {}, 'parse': None, ..."
12,3Z8UJEJOCZEG603II1EL4BE2PV593A,Syrian troops shelled a rebel-held town on Mon...,86,102,bloodied victims,10,10,0,1,1,0.05,"[bloodied, victims]",2,"{'bloodied': {}, 'victims': {}, 'parse': None,..."
28,3Z8UJEJOCZEG603II1EL4BE2PV593A,"The violence in Rastan, in the restive central...",109,133,internationally brokered,10,10,1,0,1,0.05,"[internationally, brokered]",2,"{'internationally': {}, 'brokered': {}, 'parse..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,89,111,asset management firms,10,10,4,0,1,0.20,"[asset, management, firms]",3,"{'asset': {}, 'management': {}, 'firms': {}, '..."
1733,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,118,127,fire sale,10,10,3,1,1,0.20,"[fire, sale]",2,"{'fire': {}, 'sale': {}, 'parse': None, 'phras..."
1736,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,Banks have until the end of the year to move t...,146,161,Luis de Guindos,10,10,0,0,0,0.00,"[Luis, de, Guindos]",3,"{'Luis': {}, 'de': {}, 'Guindos': {}, 'parse':..."
1752,37PGLWGSJT7FDZ4S71CXYQU7J7EIKD,The state will put less than 15 billion euros ...,83,95,bank rescues,10,10,0,0,0,0.00,"[bank, rescues]",2,"{'bank': {}, 'rescues': {}, 'parse': None, 'ph..."


In [27]:
# Get the first 3 rows
first_three = MWEs['word_dict'].head(3)

# Iterate over the dictionaries and print them
for idx, word_dict in first_three.iteritems():
    print(f"Row {idx}:\n{word_dict}\n")

Row 6:
{'sparking': {}, 'intense': {}, 'parse': None, 'phrase': None, 'total_native': None, 'total_non_native': None, 'native_complex': None, 'IMG': None, 'sub_imdb': None, 'google frequency': None, 'KFCAT': None, 'FAM': None, 'KFSMP': None, 'KFFRQ': None, 'AOA': None, 'NPHN': None, 'T-LFRQ': None}

Row 7:
{'sparking': {}, 'intense': {}, 'clashes': {}, 'parse': None, 'phrase': None, 'total_native': None, 'total_non_native': None, 'native_complex': None, 'IMG': None, 'sub_imdb': None, 'google frequency': None, 'KFCAT': None, 'FAM': None, 'KFSMP': None, 'KFFRQ': None, 'AOA': None, 'NPHN': None, 'T-LFRQ': None}

Row 9:
{'intense': {}, 'clashes': {}, 'parse': None, 'phrase': None, 'total_native': None, 'total_non_native': None, 'native_complex': None, 'IMG': None, 'sub_imdb': None, 'google frequency': None, 'KFCAT': None, 'FAM': None, 'KFSMP': None, 'KFFRQ': None, 'AOA': None, 'NPHN': None, 'T-LFRQ': None}



  for idx, word_dict in first_three.iteritems():


In [48]:
import pandas as pd

# File path
file_path = 'final_camb_feats/inspect.csv'

# Read the CSV data file into a pandas DataFrame
data_frame = pd.read_csv(file_path)

# Now you can view the DataFrame
data_frame

Unnamed: 0,sentence,ID,clean sentence,parse,start_index,end_index,phrase,total_native,total_non_native,native_complex,...,IMG,sub_imdb,google frequency,KFCAT,FAM,KFSMP,KFFRQ,AOA,NPHN,T-LFRQ
0,#37-1 Guatemalan Supreme Court approves impeac...,3QREJ3J433YH30CYS49AQ6MZ3G0LKZ,Guatemalan Supreme Court approves impeachment ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",31,39,approves,10,10,0,...,0,1,1.132012,6,0,12,14,0,5,171
1,#37-1 Guatemalan Supreme Court approves impeac...,3QREJ3J433YH30CYS49AQ6MZ3G0LKZ,Guatemalan Supreme Court approves impeachment ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",17,24,supreme,10,10,0,...,0,1,31.112747,11,0,33,51,0,0,139
2,#37-1 Guatemalan Supreme Court approves impeac...,3QREJ3J433YH30CYS49AQ6MZ3G0LKZ,Guatemalan Supreme Court approves impeachment ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",100,107,supreme,10,10,0,...,0,1,31.112747,11,0,33,51,0,0,139
3,#37-1 Guatemalan Supreme Court approves impeac...,3QREJ3J433YH30CYS49AQ6MZ3G0LKZ,Guatemalan Supreme Court approves impeachment ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",25,30,court,10,10,0,...,552,1,122.433514,13,549,64,230,0,0,701
4,#37-1 Guatemalan Supreme Court approves impeac...,3QREJ3J433YH30CYS49AQ6MZ3G0LKZ,Guatemalan Supreme Court approves impeachment ...,"{\n ""sentences"": [\n {\n ""index"": 0,\...",108,113,court,10,10,0,...,552,1,122.433514,13,549,64,230,0,0,701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6777,"#36-6 At 9 P.M. EST, INDYCAR released a statem...",344M16OZKIG450N98VEM53DJOE4ENA,"At 9 P.M. EST, INDYCAR released a statement, a...","{\n ""sentences"": [\n {\n ""index"": 0,\...",89,95,severe,10,10,5,...,352,0,47.073987,12,526,33,39,0,4,119
6778,"#36-6 At 9 P.M. EST, INDYCAR released a statem...",344M16OZKIG450N98VEM53DJOE4ENA,"At 9 P.M. EST, INDYCAR released a statement, a...","{\n ""sentences"": [\n {\n ""index"": 0,\...",96,100,head,10,10,0,...,593,1,188.298221,15,611,190,424,181,0,5047
6779,"#36-6 At 9 P.M. EST, INDYCAR released a statem...",344M16OZKIG450N98VEM53DJOE4ENA,"At 9 P.M. EST, INDYCAR released a statement, a...","{\n ""sentences"": [\n {\n ""index"": 0,\...",16,19,est,10,10,0,...,0,1,14.486005,2,0,3,3,0,0,0
6780,"#36-6 At 9 P.M. EST, INDYCAR released a statem...",344M16OZKIG450N98VEM53DJOE4ENA,"At 9 P.M. EST, INDYCAR released a statement, a...","{\n ""sentences"": [\n {\n ""index"": 0,\...",116,124,critical,10,10,4,...,305,1,76.384984,10,517,42,58,0,8,60
