In [None]:
!pip install language-tool-python

In [None]:
import language_tool_python
import pandas as pd
import matplotlib.pyplot as plt

# Error analysis in a sample text

In [None]:
import language_tool_python
from collections import Counter

# Create the US-English checker; the cells below call this same `tool`.
tool = language_tool_python.LanguageTool('en-US')

# Deliberately flawed demo text (agreement, spelling and style problems).
sample_text = """
This are an example of a bad sentence. It do not make much sense.
There is alot of mistakes, like misspelings and bad style.
I seen that before, its not unpossible.
"""

# Run the checker once over the whole sample.
matches = tool.check(sample_text)

# Collect each match's ruleIssueType, then tally occurrences per type.
error_types = []
for match in matches:
    error_types.append(match.ruleIssueType)
error_counts = Counter()
error_counts.update(error_types)

# Report every distinct issue type together with how often it was seen.
print("Detected Error Types:")
for error_type, count in error_counts.items():
    print(f"{error_type}: {count}")


In [None]:
# Spot-check one short sentence with the shared `tool`.
text = "This are incorrect sentence with mistake."
matches = tool.check(text)
print(f"{len(matches)} issues found.")
# One output line per match: "<rule id> : <message>".
for rule_id, message in ((m.ruleId, m.message) for m in matches):
    print(rule_id, ":", message)

# Error analysis in the raw essay files

In [None]:
# Initialize the grammar checking tool.
# By default language_tool_python runs a LanguageTool server in the background
# for a LanguageTool instance, so building another instance here would leave
# the earlier one (and its server) alive but unused. Reuse the notebook's
# existing `tool` when there is one; create it only when this section is run
# on its own.
if 'tool' not in globals():
    tool = language_tool_python.LanguageTool('en-US')

# === Step 1: Load Raw Essay Files ===
def load_essays(filepath):
    """Load a one-essay-per-line text file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Path of a UTF-8 text file in which essays are separated by newlines.

    Returns
    -------
    pandas.DataFrame
        A single column 'Essay Text' with one row per non-blank line, in file
        order. An empty (or whitespace-only) file yields zero rows.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Drop blank / whitespace-only lines: they are separators, not essays, and
    # would otherwise show up as empty rows (including the single '' row that
    # ''.split('\n') produces for an empty file).
    essays = [line for line in lines if line.strip()]
    return pd.DataFrame({'Essay Text': essays})

# Locations of the two raw essay files (one essay per line).
ai_essays_path = r"C:\Users\...\AI_essays.txt"
human_essays_path = r"C:\Users\...\human_essays.txt"

df_ai = load_essays(ai_essays_path)
df_human = load_essays(human_essays_path)


In [None]:
# === Step 2: Analyze Errors in One Essay ===
def analyze_errors(text):
    """Check one essay with the notebook-level `tool` and summarise the matches.

    Returns a 3-tuple:
      * number of matches reported by `tool.check(text)`,
      * matches per 100 whitespace-separated words (0 when the text has no words),
      * list with the `ruleIssueType` of every match, in match order.
    """
    found = tool.check(text)
    n_errors = len(found)
    n_words = len(text.split())

    # Guard against empty text so the rate never divides by zero.
    if n_words > 0:
        rate = (n_errors / n_words) * 100
    else:
        rate = 0

    issue_types = []
    for hit in found:
        issue_types.append(hit.ruleIssueType)
    return n_errors, rate, issue_types


In [None]:
# === Step 3: Summarize Errors for a Dataset ===
def error_summary(df):
    """Aggregate `analyze_errors` results over every essay in `df`.

    `df` must provide an 'Essay Text' column of strings. Returns a 3-tuple:
    total match count, matches per 100 words across the whole dataset
    (0 when there are no words), and a dict mapping issue type -> count.
    """
    errors_seen = 0
    words_seen = 0
    type_tally = {}

    for essay in df['Essay Text']:
        n_errors, _rate, issue_types = analyze_errors(essay)
        errors_seen += n_errors
        words_seen += len(essay.split())
        for issue in issue_types:
            if issue in type_tally:
                type_tally[issue] += 1
            else:
                type_tally[issue] = 1

    # The dataset-level rate is computed from the totals, not averaged per essay.
    if words_seen > 0:
        overall_rate = (errors_seen / words_seen) * 100
    else:
        overall_rate = 0
    return errors_seen, overall_rate, type_tally

In [None]:
# === Step 4: Run Error Analysis on Both Datasets ===
# AI-written essays: summarise, then report the headline numbers.
ai_summary = error_summary(df_ai)
ai_errors, ai_err_rate, ai_err_types = ai_summary
print(f"AI Essays: {ai_errors} total errors, {ai_err_rate:.2f} errors per 100 words")

# Human-written essays: same procedure.
human_summary = error_summary(df_human)
human_errors, human_err_rate, human_err_types = human_summary
print(f"Human Essays: {human_errors} total errors, {human_err_rate:.2f} errors per 100 words")

In [None]:
# === Step 5: Compare Error Types ===
# Rows are issue types, columns are the two datasets; a type that is missing
# from one dataset is filled with 0 before the integer cast.
dataset_labels = ['AI Essays', 'Human Essays']
error_df = pd.DataFrame([ai_err_types, human_err_types], index=dataset_labels)
error_df = error_df.fillna(0).astype(int).T
error_df['Difference'] = error_df['AI Essays'] - error_df['Human Essays']
error_df_sorted = error_df.sort_values(by='Difference', ascending=False)

# Normalized Stacked Bar Chart
# Each dataset's counts are divided by that dataset's own total, so the values
# are shares within a dataset rather than raw counts.
type_counts = error_df_sorted[dataset_labels]
error_percent = type_counts.div(type_counts.sum(axis=0), axis=1)
ax = error_percent.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='tab10')
ax.set_title('Proportion of Error Types in AI vs. Human Essays')
ax.set_ylabel('Proportion')
ax.set_xlabel('Error Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# === Step 6: Visualizations ===

# Bar Chart of Error Types
# Raw match counts per issue type, AI and human bars side by side.
count_columns = error_df_sorted[['AI Essays', 'Human Essays']]
ax = count_columns.plot.bar(figsize=(12, 6))
ax.set_title('Frequency of Error Types in AI vs. Human Essays')
ax.set_ylabel('Count')
ax.set_xlabel('Error Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Error analysis on the first 5 essays (a small chunk of each full file)

In [None]:
import language_tool_python
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
# By default language_tool_python runs a LanguageTool server in the background
# for a LanguageTool instance, so creating yet another instance here would
# leave any earlier one (and its server) alive but unused. Reuse the notebook's
# existing `tool` when there is one; create it only when this section is run
# on its own.
if 'tool' not in globals():
    tool = language_tool_python.LanguageTool('en-US')

In [None]:
def load_essays(filepath):
    """Load a one-essay-per-line text file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Path of a UTF-8 text file in which essays are separated by newlines.

    Returns
    -------
    pandas.DataFrame
        A single column 'Essay Text' with one row per non-blank line, in file
        order. An empty (or whitespace-only) file yields zero rows.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Drop blank / whitespace-only lines: they are separators, not essays, and
    # would otherwise take up rows when only the first few essays are kept.
    essays = [line for line in lines if line.strip()]
    return pd.DataFrame({'Essay Text': essays})
    
# Load only first 5 essays from each
ai_essays_path = r"C:\Users\fatim\OneDrive\Bilder\Skrivebord\text mining\AI_essays.txt"
human_essays_path = r"C:\Users\fatim\OneDrive\Bilder\Skrivebord\text mining\human_essays.txt"

# The whole file is read first; .head(5) then keeps the first five rows.
df_ai = load_essays(ai_essays_path).head(5)
df_human = load_essays(human_essays_path).head(5)


In [None]:
def analyze_errors(text):
    """Check one essay with the notebook-level `tool` and summarise the matches.

    Returns a 3-tuple:
      * number of matches reported by `tool.check(text)`,
      * matches per 100 whitespace-separated words (0 when the text has no words),
      * list with the `ruleIssueType` of every match, in match order.
    """
    found = tool.check(text)
    n_errors = len(found)
    n_words = len(text.split())

    # Guard against empty text so the rate never divides by zero.
    if n_words > 0:
        rate = (n_errors / n_words) * 100
    else:
        rate = 0

    issue_types = []
    for hit in found:
        issue_types.append(hit.ruleIssueType)
    return n_errors, rate, issue_types

def error_summary(df, label=""):
    """Aggregate `analyze_errors` results over `df`, printing progress per essay.

    `df` must provide an 'Essay Text' column of strings; `label` only prefixes
    the progress lines. An essay whose processing raises is reported with a
    printed message and the loop moves on to the next essay.

    Returns a 3-tuple: total match count, matches per 100 words across the
    processed essays (0 when there are no words), and a plain dict mapping
    issue type -> count.
    """
    errors_seen = 0
    words_seen = 0
    type_tally = {}

    for i, text in enumerate(df['Essay Text']):
        print(f"[{label}] Analyzing essay {i+1}/{len(df)}...")
        try:
            n_errors, _rate, issue_types = analyze_errors(text)
            errors_seen += n_errors
            words_seen += len(text.split())
            for issue in issue_types:
                type_tally[issue] = type_tally.get(issue, 0) + 1
        except Exception as e:
            # Best effort: report the failure and keep going.
            print(f"Error on essay {i+1}: {e}")

    # The dataset-level rate is computed from the totals, not averaged per essay.
    if words_seen > 0:
        overall_rate = (errors_seen / words_seen) * 100
    else:
        overall_rate = 0
    return errors_seen, overall_rate, type_tally


In [None]:
# Summarise each subset; `label` only tags the progress lines that are printed.
ai_summary = error_summary(df_ai, label="AI")
ai_errors, ai_err_rate, ai_err_types = ai_summary

human_summary = error_summary(df_human, label="Human")
human_errors, human_err_rate, human_err_types = human_summary

In [None]:
# Headline totals and per-100-word rates for the two subsets.
ai_line = f"\nAI Essays: {ai_errors} total errors, {ai_err_rate:.2f} errors per 100 words"
print(ai_line)
human_line = f"Human Essays: {human_errors} total errors, {human_err_rate:.2f} errors per 100 words"
print(human_line)


In [None]:
# Create DataFrame
# Rows are issue types, columns are the two datasets; a type that is missing
# from one dataset is filled with 0 before the integer cast.
dataset_labels = ['AI Essays', 'Human Essays']
error_df = pd.DataFrame([ai_err_types, human_err_types], index=dataset_labels)
error_df = error_df.fillna(0).astype(int).T
error_df['Difference'] = error_df['AI Essays'] - error_df['Human Essays']
# Largest AI-minus-human difference first.
error_df = error_df.sort_values(by='Difference', ascending=False)

# Print
display(error_df)

In [None]:
# Absolute Counts
# Raw match counts per issue type, AI and human bars side by side.
dataset_labels = ['AI Essays', 'Human Essays']
type_counts = error_df[dataset_labels]
ax = type_counts.plot.bar(figsize=(12, 6))
ax.set_title('Error Types in AI vs. Human Essays (First 5)')
ax.set_ylabel('Count')
ax.set_xlabel('Error Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Normalized Stacked Bar Chart
# Each dataset's counts are divided by that dataset's own total, so the values
# are shares within a dataset rather than raw counts.
error_percent = type_counts.div(type_counts.sum(axis=0), axis=1)
ax = error_percent.plot.bar(stacked=True, figsize=(12, 6), colormap='tab10')
ax.set_title('Proportional Error Types in AI vs. Human Essays (First 5)')
ax.set_ylabel('Proportion')
ax.set_xlabel('Error Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()