In [2]:
import os
import pandas as pd
import numpy as np
import pickle

In [16]:
# Split the feature files into native and non-native annotations. A row is kept for a group when that group's `*_complex` value is present and the other group's value is missing or 0.


def split_pkl_files(directory_path):
    """Split each pickled DataFrame in ``directory_path`` into native and non-native subsets.

    Only directory entries whose names end in ``'_Final'`` are processed. For
    every such file two subsets are derived:

    * native: ``native_complex`` is not null and ``non_native_complex`` is
      null or 0;
    * non-native: ``non_native_complex`` is not null and ``native_complex`` is
      null or 0.

    The output base name is the input name with everything after its last
    ``'.'`` (if any) removed, prefixed with ``Native_`` or ``Non_Native_``.
    Each subset is written as ``<base>.csv`` under ``Native/CSV`` or
    ``Non_Native/CSV`` inside ``directory_path`` (created on demand) and as
    ``<base>.pkl`` directly inside ``directory_path``. Nothing is returned.
    """
    for entry in os.listdir(directory_path):
        if not entry.endswith('_Final'):
            continue

        # Load the pickled DataFrame for this entry.
        with open(os.path.join(directory_path, entry), 'rb') as source:
            frame = pickle.load(source)

        # Native rows: annotated by natives, not marked complex by non-natives.
        native_mask = frame['native_complex'].notnull() & (
            frame['non_native_complex'].isnull() | (frame['non_native_complex'] == 0)
        )
        # Non-native rows: annotated by non-natives, not marked complex by natives.
        non_native_mask = frame['non_native_complex'].notnull() & (
            frame['native_complex'].isnull() | (frame['native_complex'] == 0)
        )

        stem = entry.rsplit('.', 1)[0]
        # (sub-folder label, output base name, subset) for each group.
        outputs = [
            ('Native', 'Native_' + stem, frame[native_mask]),
            ('Non_Native', 'Non_Native_' + stem, frame[non_native_mask]),
        ]

        # Stage 1: make sure both CSV sub-folders exist.
        csv_dirs = [os.path.join(directory_path, label, 'CSV') for label, _, _ in outputs]
        for csv_dir in csv_dirs:
            os.makedirs(csv_dir, exist_ok=True)

        # Stage 2: CSV copies inside the sub-folders.
        for csv_dir, (_, out_name, subset) in zip(csv_dirs, outputs):
            subset.to_csv(os.path.join(csv_dir, out_name + '.csv'), index=False)

        # Stage 3: pickle copies next to the input file.
        for _, out_name, subset in outputs:
            with open(os.path.join(directory_path, out_name + '.pkl'), 'wb') as sink:
                pickle.dump(subset, sink)


# Run the split on the directory below (the path is relative to the notebook's working directory).
# Note: split_pkl_files only picks up entries whose names end in '_Final', not every .pkl file.
split_pkl_files('../camb_model/cwi_2018-master/final_camb_feats_Test')

In [15]:
!pwd

/Users/adamtucker/Desktop/CWI_masters/Camb_A


In [9]:
# This works for A1 EFCAMDAT data 

# Write separate functions for each level, e.g. get_A1_freq


import os
import pandas as pd

def get_cefr_freq_A1(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Add an ``A1_freq`` column to every feature pickle in ``folder_path``.

    Word counts are taken from the ``text_corrected_column_name`` column of
    ``<folder_path>/../Corpus/EFCAMDAT_A1.pkl`` (whitespace-tokenised). For
    each input ``<name>.pkl`` the value of ``A1_freq`` is the sum of the
    corpus counts of the whitespace-separated words in the row's phrase, and
    the result is written to ``<name>_EFCAMDAT.pkl`` and
    ``<name>_EFCAMDAT.csv`` in the same folder.

    Parameters
    ----------
    folder_path : str
        Folder holding the feature ``.pkl`` files.
    phrase_column_name : str
        Column of the feature frames that holds the target phrase.
    text_corrected_column_name : str
        Column of the corpus frame that holds the text to count words in.

    Notes
    -----
    * Files named ``EFCAMDAT_A1.pkl`` and files already ending in
      ``_EFCAMDAT.pkl`` are skipped, so re-running the function does not
      re-process its own outputs.
    * A missing (non-string) phrase gets a frequency of 0 instead of raising.
    * A later cell in this notebook defines a function with the same name;
      whichever definition was executed last is the one in effect.
    """
    output_suffix = '_EFCAMDAT'
    extension = '.pkl'

    # Candidate inputs: every .pkl in the folder except the corpus itself and
    # outputs produced by a previous run. Sorted for a reproducible order.
    pkl_files = sorted(
        name for name in os.listdir(folder_path)
        if name.endswith(extension)
        and name != 'EFCAMDAT_A1.pkl'
        and not name.endswith(output_suffix + extension)
    )

    # Load the corpus that supplies the word frequencies.
    path_to_corpus = os.path.join(folder_path, '../Corpus/EFCAMDAT_A1.pkl')
    corpus_data = pd.read_pickle(path_to_corpus)

    # token -> number of occurrences in the corpus. explode() counts tokens
    # without materialising a (rows x longest-text) matrix; value_counts()
    # drops the NaN entries that come from missing or empty texts.
    word_freq_dict = (
        corpus_data[text_corrected_column_name]
        .str.split()
        .explode()
        .value_counts()
        .to_dict()
    )

    def phrase_frequency(phrase):
        # Missing phrases (NaN/None) contain no words, so they count as 0.
        if not isinstance(phrase, str):
            return 0
        return sum(word_freq_dict.get(word, 0) for word in phrase.split())

    for pkl_file in pkl_files:
        df = pd.read_pickle(os.path.join(folder_path, pkl_file))
        df['A1_freq'] = df[phrase_column_name].apply(phrase_frequency)

        # Build the output name from the file name only, so a ".pkl" elsewhere
        # in the path (e.g. in a directory name) is never rewritten.
        output_base = os.path.join(folder_path, pkl_file[:-len(extension)] + output_suffix)
        df.to_pickle(output_base + '.pkl')
        df.to_csv(output_base + '.csv', index=False)

if __name__ == "__main__":
    folder_path = "Lang8_feat_test/"
    # This cell defines get_cefr_freq_A1; the previous call to
    # get_cefr_freq(folder_path) targeted a name this cell does not define
    # (and whose later five-argument definition would reject a single argument).
    get_cefr_freq_A1(folder_path)

In [28]:
# Functions that add, for each CEFR-level corpus, a column holding the summed corpus frequency of the words in each target phrase

import os
import pandas as pd

def get_cefr_freq_A1(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Run ``get_cefr_freq`` with the ``EFCAMDAT_A1.pkl`` corpus and the ``A1_freq`` column name.

    Returns whatever ``get_cefr_freq`` returns.
    """
    return get_cefr_freq(
        folder_path,
        phrase_column_name,
        text_corrected_column_name,
        cefr_file='EFCAMDAT_A1.pkl',
        cefr_freq_column='A1_freq',
    )

def get_cefr_freq_A2(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Run ``get_cefr_freq`` with the ``EFCAMDAT_A2.pkl`` corpus and the ``A2_freq`` column name.

    Returns whatever ``get_cefr_freq`` returns.
    """
    return get_cefr_freq(
        folder_path,
        phrase_column_name,
        text_corrected_column_name,
        cefr_file='EFCAMDAT_A2.pkl',
        cefr_freq_column='A2_freq',
    )

def get_cefr_freq_B1(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Run ``get_cefr_freq`` with the ``EFCAMDAT_B1.pkl`` corpus and the ``B1_freq`` column name.

    Returns whatever ``get_cefr_freq`` returns.
    """
    return get_cefr_freq(
        folder_path,
        phrase_column_name,
        text_corrected_column_name,
        cefr_file='EFCAMDAT_B1.pkl',
        cefr_freq_column='B1_freq',
    )

def get_cefr_freq_B2(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Run ``get_cefr_freq`` with the ``EFCAMDAT_B2.pkl`` corpus and the ``B2_freq`` column name.

    Returns whatever ``get_cefr_freq`` returns.
    """
    return get_cefr_freq(
        folder_path,
        phrase_column_name,
        text_corrected_column_name,
        cefr_file='EFCAMDAT_B2.pkl',
        cefr_freq_column='B2_freq',
    )

def get_cefr_freq_C1(folder_path, phrase_column_name='phrase', text_corrected_column_name='text_corrected'):
    """Run ``get_cefr_freq`` with the ``EFCAMDAT_C1.pkl`` corpus and the ``C1_freq`` column name.

    Returns whatever ``get_cefr_freq`` returns.
    """
    return get_cefr_freq(
        folder_path,
        phrase_column_name,
        text_corrected_column_name,
        cefr_file='EFCAMDAT_C1.pkl',
        cefr_freq_column='C1_freq',
    )

def get_cefr_freq(folder_path, phrase_column_name, text_corrected_column_name, cefr_file, cefr_freq_column):
    """Add a corpus-frequency column to every feature pickle in a folder and merge them.

    Word counts come from the ``text_corrected_column_name`` column of
    ``<folder_path>/../Corpus/<cefr_file>`` (whitespace-tokenised). Each input
    frame gets a ``cefr_freq_column`` whose value is the sum of the corpus
    counts of the whitespace-separated words in the row's phrase. All frames
    are concatenated (fresh integer index) and written to
    ``<folder_path>/merged_<cefr_file>`` plus a ``.csv`` twin.

    Parameters
    ----------
    folder_path : str
        Folder holding the feature ``.pkl`` files; outputs are written here too.
    phrase_column_name : str
        Column of the feature frames that holds the target phrase.
    text_corrected_column_name : str
        Column of the corpus frame that holds the text to count words in.
    cefr_file : str
        File name of the corpus pickle inside ``../Corpus``.
    cefr_freq_column : str
        Name of the frequency column to add.

    Notes
    -----
    * Files whose names start with ``merged_`` are skipped. This function
      writes its output into ``folder_path``; without the skip, a call for a
      second corpus would read the first call's merged output as an input and
      duplicate its rows.
    * A missing (non-string) phrase gets a frequency of 0 instead of raising.
    * Inputs are processed in sorted file-name order so the merged row order
      is reproducible.
    """
    merged_prefix = 'merged_'

    # Load the corpus that supplies the word frequencies.
    path_to_corpus = os.path.join(folder_path, '../Corpus', cefr_file)
    corpus_data = pd.read_pickle(path_to_corpus)

    # token -> number of occurrences in the corpus. explode() counts tokens
    # without materialising a (rows x longest-text) matrix; value_counts()
    # drops the NaN entries that come from missing or empty texts.
    word_freq_dict = (
        corpus_data[text_corrected_column_name]
        .str.split()
        .explode()
        .value_counts()
        .to_dict()
    )

    def phrase_frequency(phrase):
        # Missing phrases (NaN/None) contain no words, so they count as 0.
        if not isinstance(phrase, str):
            return 0
        return sum(word_freq_dict.get(word, 0) for word in phrase.split())

    # Collect the annotated frames and concatenate once at the end rather
    # than re-copying a growing frame on every iteration.
    frames = []
    for pkl_file in sorted(os.listdir(folder_path)):
        if not pkl_file.endswith('.pkl') or pkl_file == cefr_file:
            continue
        if pkl_file.startswith(merged_prefix):
            continue  # output of an earlier get_cefr_freq call, not an input

        df = pd.read_pickle(os.path.join(folder_path, pkl_file))
        df[cefr_freq_column] = df[phrase_column_name].apply(phrase_frequency)
        frames.append(df)

    # pd.concat raises on an empty list, so keep the empty-frame result the
    # original produced when the folder has no inputs.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save the merged DataFrame to a single .pkl file
    merged_file_path_pkl = os.path.join(folder_path, f"{merged_prefix}{cefr_file}")
    merged_df.to_pickle(merged_file_path_pkl)

    # Save the merged DataFrame to a single .csv file
    merged_file_path_csv = os.path.join(folder_path, f"{merged_prefix}{cefr_file.replace('.pkl', '.csv')}")
    merged_df.to_csv(merged_file_path_csv, index=False)

# Usage example: run the five CEFR-level passes (A1, A2, B1, B2, C1) over the same feature folder.
# Each call writes merged_EFCAMDAT_<level>.pkl and .csv into folder_path.
if __name__ == "__main__":
    folder_path = "Lang8_feat_test/"  # relative to the notebook's working directory
    get_cefr_freq_A1(folder_path)
    get_cefr_freq_A2(folder_path)
    get_cefr_freq_B1(folder_path)
    get_cefr_freq_B2(folder_path)
    get_cefr_freq_C1(folder_path)
