In [2]:
import os  
import re 
import language_tool_python
import nltk
import pandas as pd
from zemberek import TurkishMorphology, TurkishSentenceNormalizer
from nltk.tokenize import word_tokenize
import openpyxl

# NLTK utils
nltk.download('punkt')  
nltk.download('words')  
nltk.download('names') 


# Load English words and common names from NLTK
english_words = set(nltk.corpus.words.words())
english_names = set(nltk.corpus.names.words())

# Initialize TurkishMorphology and TurkishSentenceNormalizer
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)

# Function to correct codec awkwardness
def correct_codec_awardness(text):
    replacements = {
        'Å': 'S',  # Ş -> S
        'Å': 's',  # ş -> s
        'Ä±': 'i',  # ı -> i
        'Ä': 'g',  # ğ -> g
        'Ã¼': 'u',  # ü -> u
        'Ã¶': 'o',  # ö -> o
        'Ã§': 'c',  # ç -> c
        'Ä': 'G',  # Ğ -> G
        'Å¸': 'Y',  # Ÿ -> Y
        'Ä': 'c',  # ć -> c
        'Ã': 'O',  # Ö -> O
        'Ã': 'C',  # Ç -> C
        'Ã': 'U',  # Ü -> U
        'â': "'",   # ’ -> '
        'â': '“',  # “ -> “
        'â': '”',  # ” -> ”
        'â': '–',  # – -> –
        'â': '—',  # — -> —
        'â¦': '…',  # … -> …
        'â¬': '€',  # € -> €
    }
    
    for wrong, right in replacements.items():
        text = text.replace(wrong, right)
    return text

# Function to delete words containing weird Turkish characters
def delete_weird_turkish_words(text):
    weird_turkish_chars = r'[Å|Ä|Ã|||||]'
    words = text.split()
    cleaned_words = [word for word in words if not re.search(weird_turkish_chars, word)]
    return ' '.join(cleaned_words)

# Function to convert Turkish characters to English counterparts
def convert_turkish_characters(text):
    turkish_to_english = {
        'ü': 'u',
        'ö': 'o',
        'ş': 's',
        'ç': 'c',
        'ğ': 'g',
        'ı': 'i',
        'İ': 'I',
        'Ü': 'U',
        'Ö': 'O',
        'Ş': 'S',
        'Ç': 'C',
        'Ğ': 'G'
    }
    
    for turkish_char, english_char in turkish_to_english.items():
        text = text.replace(turkish_char, english_char)
    
    return text

# RE function to remove names from the corpus
def remove_names(text, name_list):
    pattern = r'\b(' + '|'.join(re.escape(name) for name in name_list) + r')\b'
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# CleanSweep 2.0 Function
def clean_sweep_2_0(text):
    # Step 1: Replace lines with reviewer comments
    step1_pattern = r"^\s*$\r?\n(?=^\[[a-zA-Z]\][^\n\r]*)"
    text = re.sub(step1_pattern, r"\r\n\r\n$Reviewer Comments$\r\n", text, flags=re.MULTILINE)
    
    # Step 2: Remove everything after $Reviewer Comments$
    step2_pattern = r"\$Reviewer Comments\$(?s).*$"
    text = re.sub(step2_pattern, '', text, flags=re.MULTILINE)
    
    # Step 3: Replace remaining annotations [a], [b], etc., with a single whitespace
    step3_pattern = r"\[[a-zA-Z]\]"
    text = re.sub(step3_pattern, ' ', text)
    
    return text

# PREPROCESSING
def preprocess_corpus(text, name_list):
    # Step 1: Apply CleanSweep 2.0 procedure
    text = clean_sweep_2_0(text)
    
    # Step 2: Correct codec awkwardness
    text = correct_codec_awardness(text)
    
    # Step 3: Delete words containing weird Turkish characters
    text = delete_weird_turkish_words(text)
    
    # Step 4: Convert Turkish characters to English counterparts
    text = convert_turkish_characters(text)
    
    # Step 5: Remove names from the corpus
    text = remove_names(text, name_list)
    
    return text

# LANGTOOL BEGINS HERE - NO WHITESPACE TYPOGRAPHY OR LOCALE VIOLATION
def count_grammatical_errors(tool, text):
    matches = tool.check(text)
    filtered_matches = [
        match for match in matches
        if match.ruleIssueType not in ['whitespace', 'typographical', 'locale-violation'] and
           not "is British English" in match.message and
           not match.context[match.offsetInContext:match.offsetInContext + match.errorLength].strip().lower() in ['turkiye', 'türkiye']
    ]
    return len(filtered_matches), filtered_matches

# Function to count words in the text
def count_words(text):
    words = word_tokenize(text)
    return len(words), words

# Function to check if a word is Turkish using Zemberek
def is_turkish(word):
    try:
        normalized_word = normalizer.normalize(word)
        analysis = morphology.analyze(normalized_word)
        for result in analysis:
            if not result.is_unknown():
                return True
    except Exception as e:
        print(f"Error processing word '{word}': {e}")
    return False

# Function to extract the error word(s) from a match
def extract_error_word(match):
    context = match.context
    error_start = match.offsetInContext
    error_end = error_start + match.errorLength
    error_word = context[error_start:error_end].strip()
    
    # If the error word contains a hyphen, split it into separate words
    if '-' in error_word:
        return error_word.split('-')
    return [error_word]

# Function to process all text files in the directory with preprocessing and analysis
def process_corpus_directory_with_analysis(directory, name_list):
    tool = language_tool_python.LanguageTool('en-US')
    
    # Summary data and error details will be stored in pandas dataframes
    summary_data = []
    error_details_list = []

    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='iso-8859-9') as file:
                text = file.read()

                # Preprocess the text
                text = preprocess_corpus(text, name_list)

                # Perform grammatical error analysis
                num_errors, matches = count_grammatical_errors(tool, text)
                num_words, words = count_words(text)
                error_to_word_ratio = num_errors / num_words if num_words > 0 else 0

                # Store the summary data for each file
                summary_data.append({
                    'Filename': filename,
                    'NumErrors': num_errors,
                    'NumWords': num_words,
                    'ErrorToWordRatio': error_to_word_ratio
                })

                # Store error details for each match
                for match in matches:
                    error_words = extract_error_word(match)
                    if not error_words:
                        error_words = ["[Unable to extract]"]
                    for error_word in error_words:
                        error_details_list.append({
                            'Filename': filename,
                            'Sentence': match.sentence,
                            'Error': error_word,
                            'Category': match.ruleIssueType,
                            'Message': match.message
                        })

    # Convert to pandas DataFrames
    summary_df = pd.DataFrame(summary_data)
    error_details_df = pd.DataFrame(error_details_list)

    # Manipulate the data: Remove rows where the error word is Turkish
    updated_summary = {}

    for filename in summary_df['Filename']:
        df_file_errors = error_details_df[error_details_df['Filename'] == filename]
        num_words = summary_df.loc[summary_df['Filename'] == filename, 'NumWords'].values[0]
        
        rows_to_drop = df_file_errors[df_file_errors['Error'].apply(is_turkish)].index
        df_file_errors.drop(rows_to_drop, inplace=True)

        # Calculate updated NumErrors and ErrorToWordRatio
        updated_num_errors = df_file_errors.shape[0]
        updated_error_to_word_ratio = updated_num_errors / num_words if num_words > 0 else 0
        
        # Store the updated NumErrors and ErrorToWordRatio
        updated_summary[filename] = {
            'NumErrors': updated_num_errors,
            'ErrorToWordRatio': updated_error_to_word_ratio
        }

        # Replace the error details for the file in the main dataframe
        error_details_df = error_details_df.drop(error_details_df[error_details_df['Filename'] == filename].index)
        error_details_df = pd.concat([error_details_df, df_file_errors])

    # Update the summary DataFrame
    for filename, data in updated_summary.items():
        summary_df.loc[summary_df['Filename'] == filename, 'NumErrors'] = data['NumErrors']
        summary_df.loc[summary_df['Filename'] == filename, 'ErrorToWordRatio'] = data['ErrorToWordRatio']

    # Save to Excel
    output_excel = os.path.join(directory, 'detailed_analysis.xlsx')
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        summary_df.to_excel(writer, sheet_name="Summary", index=False)
        error_details_df.to_excel(writer, sheet_name="Error Details", index=False)

    print(f"Detailed analysis saved to {output_excel}")

# PATH
folder_path = r'C:\Users\Egemen\Desktop\diss\Diss Stuff\[2] Pretest'
name_list = ['aleyna kaymak', 'nida alışık', 'şenay tatlı', 'aleyna kaymak', 'helin tals', 'abidin yıldırım', 'emine kahraman', 'sümeyye çakmak', 'elif okumuş', 'özlem doğan', 'emine kahraman', 'eren avcı', 'osman şen', 'kübra değirmenci', 'kübra köse', 'sura ural', 'hicran helin talş', 'melis eren', 'ceren erkek', 'dilara', 'mehmetali andaç beylikçi', 'nupelda kandemir', 'sanem İzci', 'zehra esmer', 'ayşen köse', 'demet toprak', 'fadime değirmenci', 'bilge nur ezmez', 'celal samyurek', 'ezgi samyurek', 'yusuf buğra kılıç', 'İrem kaya', 'nagehan karacaoğlu', 'hicran helin talş', 'hatice çelen', 'berfin yelken', 'umay kuşcu', 'abidin yıldırım', 'öykü çepelli', 'cahit mert tümen', 'semih çalışkan', 'beyza bayrak', 'sıla lük', 'eda nur yardım', 'fatma ördek', 'nida akkuş', 'merve er', 'ayşe doğan', 'büşra bacak', 'merve er', 'özlem doğan', 'burak şahin', 'nilsu zeren', 'fatih hasoğlu', 'zahide kara', 'mihriban koç', 'ecem ceren çevik', 'sıla ercan', 'dilara gülbağlar', 'ayşe sapmaz', 'halime oğurer', 'melis üçkan', 'buse azgın', 'merve eroğlu', 'emircan parlak', 'hilal şeker', 'selin keskin', 'büşra yücel', 'turancan çelikay', 'özgenur kısa', 'mustafa güloğlu', 'öznur kaya', 'ecem ceren çevik', 'elanur günüç', 'aleyna boy', 'deniz ortaçbayram', 'havva aydın', 'tuğçe özdemir', 'songül begit', 'esma yerebakan', 'ece berfu kaya', 'özge erkaya', 'ülkü sahra karaca', 'sema karasu', 'dilay ada', 'zehra öztürk', 'kardelen avcı', 'halil umut tez', 'halil umut tez', 'hasan mert menteş', 'selin yücelbulut', 'süeda öntürkler', 'egemen curuk', 'arın çağla kırmızıçiçek', 'engin can yayla', 'aleyna kaymak', 'nida alisik', 'senay tatli', 'aleyna kaymak', 'abidin yildirim', 'emine kahraman', 'sumeyye cakmak', 'elif okumus', 'ozlem dogan', 'emine kahraman', 'eren avci', 'osman sen', 'kubra degirmenci', 'kubra kose', 'sura ural', 'hicran helin talsh', 'melis eren', 'ceren erkek', 'dilara', 'mehmetali andac beylikci', 'nupelda kandemir', 'sanem Izci', 'zehra esmer', 'ayshen kose', 'demet toprak', 'fadime degirmenci', 'bilge nur ezmez', 'celal samyurek', 'ezgi samyurek', 'yusuf bugra kilic', 'Irem kaya', 'nagehan karacaoglu', 'hicran helin talsh', 'hatice celen', 'berfin yelken', 'umay kuscu', 'abidin yildirim', 'oyku cepelli', 'cahit mert tuman', 'semih caliskan', 'beyza bayrak', 'sila luk', 'eda nur yardim', 'fatma ordek', 'nida akkus', 'merve er', 'ayse dogan', 'busra bacak', 'merve er', 'ozlem dogan', 'burak sahin', 'nilsu zeren', 'fatih hasoglu', 'zahide kara', 'mihriban koc', 'ecem ceren cevik', 'sila ercan', 'dilara gulbaglar', 'ayse sapmaz', 'halime ogurer', 'melis uckan', 'buse azgin', 'merve eroglu', 'emircan parlak', 'hilal seker', 'selin keskin', 'busra yucel', 'turancan celikay', 'ozgenur kisa', 'mustafa guloglu', 'oznur kaya', 'ecem ceren cevik', 'elanur gunuc', 'aleyna boy', 'deniz ortacbayram', 'havva aydin', 'tugce ozdemir', 'songul begit', 'esma yerebakan', 'ece berfu kaya', 'ozge erkaya', 'ulku sahra karaca', 'sema karasu', 'dilay ada', 'zehra ozturk', 'kardelen avci', 'halil umut tez', 'halil umut tez', 'hasan mert mentes', 'selin yucelbulut', 'sueda onturkl', 'egemen curuk', 'arin cagla kirmizicicek', 'engin can yayla', 'aleyna', 'nida', 'senay', 'emine', 'sumeyye', 'elif', 'ozlem', 'eren', 'osman', 'sura', 'hicran', 'melis', 'ceren', 'dilara', 'mehmetali', 'nupelda', 'sanem', 'zehra', 'ayshen', 'demet', 'fadime', 'bilge', 'celal', 'ezgi', 'yusuf', 'Irem', 'nagehan', 'hatice', 'berfin', 'umay', 'oyku', 'cahit', 'semih', 'beyza', 'eda', 'fatma', 'nida', 'merve', 'ayse', 'busra', 'burak', 'nilsu', 'fatih', 'zahide', 'mihriban', 'ecem', 'sila', 'halime', 'buse', 'emircan', 'hilal', 'selin', 'turancan', 'ozgenur', 'mustafa', 'oznur', 'elanur', 'deniz', 'havva', 'tugce', 'songul', 'esma', 'ece', 'ozge', 'ulku', 'sema', 'dilay', 'kardelen', 'halil', 'hasan', 'sueda', 'arin', 'engin', 'kaymak', 'alisik', 'tatli', 'yildirim', 'kahraman', 'cakmak', 'okumus', 'dogan', 'sen', 'degirmenci', 'kose', 'ural', 'talsh', 'erkek', 'beylikci', 'kandemir', 'Izci', 'esmer', 'kose', 'toprak', 'ezmez', 'samyurek', 'kilic', 'karacaoglu', 'celen', 'yelken', 'kuscu', 'cepelli', 'tuman', 'caliskan', 'bayrak', 'luk', 'yardim', 'ordek', 'akkus', 'sahin', 'zeren', 'hasoglu', 'kara', 'koc', 'cevik', 'ercan', 'gulbaglar', 'sapmaz', 'ogurer', 'uckan', 'azgin', 'eroglu', 'parlak', 'seker', 'keskin', 'yucel', 'celikay', 'kisa', 'guloglu', 'kaya', 'gunuc', 'ortacbayram', 'aydin', 'ozdemir', 'begit', 'yerebakan', 'erkaya', 'karaca', 'karasu', 'ada', 'ozturk', 'avci', 'tez', 'mentes', 'yucelbulut', 'onturkl', 'curuk', 'kirmizicicek', 'yayla', 'mehmetali', 'nupelda', 'sanem', 'ayshen', 'ayse', 'eda', 'burak', 'nilsu', 'mihriban', 'ecem', 'sila', 'halime', 'buse', 'emircan', 'engin', 'arin', 'egemen', 'curuk']
process_corpus_directory_with_analysis(folder_path, name_list)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Egemen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Egemen\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Egemen\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
TurkishMorphology instance initialized in 15.337981462478638


2024-09-06 14:18:02,789 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 15.337981462478638



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Detailed analysis saved to C:\Users\Egemen\Desktop\diss\Diss Stuff\[2] Pretest\detailed_analysis.xlsx
