<a href="https://colab.research.google.com/github/fi-co/Distinguishing-AI-Generated-and-Human-Written-Scientific-Texts-/blob/main/Feature_extraction_(HLT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [None]:
import pandas as pd
import numpy as np
import re

**MODEL**

In [None]:

# Initialize feature extraction function

def extract_features(df):
    """Compute hand-crafted stylometric features for every row of ``df['Text']``.

    Args:
        df: DataFrame with a ``'Text'`` column. Missing values are treated as
            empty strings and non-string values are converted with ``str``.

    Returns:
        A DataFrame sharing the index of ``df['Text']`` with one column per
        feature, in this order: ``sentences_per_paragraph``,
        ``sentence_length_std``, ``word_count``, ``average_sentence_length``,
        ``lexical_diversity``, ``contains_colon_semicolon``,
        ``contains_question_mark``, ``contains_apostrophe``,
        ``contains_although``, ``contains_connectors``, ``contains_however``,
        ``contains_others_researchers``, ``contains_numbers``,
        ``contains_delve``, ``capitals_to_periods_ratio``, ``contains_et``.
        The ``contains_*`` columns are 0/1 integers.

    Raises:
        KeyError: if ``df`` has no ``'Text'`` column.
    """
    # Normalise once up front: a NaN (or numeric) cell would otherwise make
    # re.split() raise and turn the boolean flags into NaN, which cannot be
    # cast to int.
    texts = df['Text'].fillna('').astype(str)

    features = pd.DataFrame()

    def sentence_lengths(text):
        # Word count of each non-blank chunk between '.', '!' and '?'.
        sentences = re.split(r'[.!?]', text)
        return [len(sentence.split()) for sentence in sentences if sentence.strip()]

    def sentence_length_std(text):
        # A spread needs at least two sentences; otherwise report 0.
        lengths = sentence_lengths(text)
        return np.std(lengths) if len(lengths) > 1 else 0

    def contains_frequent_capitals(text):
        # Upper-case ASCII letters per '.'; max(1, ...) avoids dividing by zero.
        return len(re.findall(r'[A-Z]', text)) / max(1, text.count('.'))

    def average_sentence_length(text):
        lengths = sentence_lengths(text)
        return np.mean(lengths) if lengths else 0

    def lexical_diversity(text):
        # Type/token ratio over lower-cased word tokens.
        words = re.findall(r'\b\w+\b', text.lower())
        return len(set(words)) / len(words) if words else 0

    def word_count(text):
        # Number of word tokens in the text.
        return len(re.findall(r'\b\w+\b', text))

    def contains_connector(text):
        # Plain substring search (no word boundaries) on the lower-cased text.
        lowered = text.lower()
        return int(any(connector in lowered for connector in ['also', 'in addition']))

    def contains(pattern):
        # 0/1 flag: does the case-insensitive regex match anywhere in the text?
        return texts.str.contains(pattern, case=False, regex=True).astype(int)

    # Feature set.
    # Those commented were found not discriminating. For transparency, I retained them.

    # Counts sentence terminators ('.', '!', '?'): number of split pieces minus one.
    features['sentences_per_paragraph'] = texts.apply(lambda x: len(re.split(r'[.!?]', x)) - 1)

    features['sentence_length_std'] = texts.apply(sentence_length_std)

    #features['repetitive_paragraph_structure'] = df['Text'].apply(lambda x: 1 if len(set(re.split(r'[.!?]', x))) < 2 else 0)

    features['word_count'] = texts.apply(word_count)

    features['average_sentence_length'] = texts.apply(average_sentence_length)

    features['lexical_diversity'] = texts.apply(lexical_diversity)

    features['contains_colon_semicolon'] = contains(r'[;:]')

    features['contains_question_mark'] = contains(r'\?')

    features['contains_apostrophe'] = contains(r"'")

    #features['sentence_length_uniformity'] = df['Text'].apply(lambda x: 1 if len(set(sentence_lengths(x))) == 1 else 0)

    #features['repetitive_sentence_structures'] = df['Text'].apply(lambda x: repetitive_structure(sentence_lengths(x)))

    features['contains_although'] = contains(r'\balthough\b')

    features['contains_connectors'] = texts.apply(contains_connector)

    features['contains_however'] = contains(r'\bhowever\b')

    features['contains_others_researchers'] = contains(r'\bothers\b|\bresearchers\b')

    features['contains_numbers'] = contains(r'\d')

    # Non-capturing group: a capturing group makes pandas emit a
    # "pattern ... has match groups" UserWarning from str.contains.
    features['contains_delve'] = contains(r'\b(?:delve|delves)\b')

    features['capitals_to_periods_ratio'] = texts.apply(contains_frequent_capitals)

    features['contains_et'] = contains(r'\bet\b')

    return features

In [None]:

# Load the human-written and the AI-generated corpora from two
# semicolon-delimited CSV files.
# NOTE(review): both calls read 'file_name', which looks like a placeholder;
# point each call at its own file before running.

df_human = pd.read_csv('file_name', delimiter=';')
df_ai = pd.read_csv('file_name', delimiter=';')


# Ensure the columns this notebook relies on are present: 'Label' (the class)
# and 'Text' (the raw document). If a column has a different name, replace it here.
# Raised explicitly instead of asserted so the check also runs under `python -O`.

for _source, _frame in (('human.csv', df_human), ('ai.csv', df_ai)):
    _missing = [col for col in ('Label', 'Text') if col not in _frame.columns]
    if _missing:
        raise ValueError(f"{', '.join(_missing)} column missing in {_source}")

# Check structure of the loaded data
#print(df_human.head())
#print(df_ai.head())

# Stack both corpora into one frame with a fresh 0..n-1 index

df = pd.concat([df_human, df_ai], ignore_index=True)

In [None]:

# Pull the target column and the raw texts out of the combined frame

labels, texts = df['Label'], df['Text']


# Wrap the texts in a single-column frame for feature extraction

feature_df = texts.to_frame(name='Text')


# Compute the feature matrix

X = extract_features(feature_df)


# Put the labels (index reset to 0..n-1) next to the features, column-wise

feature_map = pd.concat(
    [X, labels.reset_index(drop=True)],
    axis='columns',
)

# Write the final dataset to disk without the index column
feature_map.to_csv('features.csv', index=False)

  features['contains_delve'] = df['Text'].str.contains(r'\b(delve|delves)\b', case=False, regex=True).astype(int)


Variance analysis


In [None]:

# Per-feature variance; print only the features whose variance is exactly zero

feature_variance = X.var()
print(feature_variance.loc[feature_variance.eq(0)])

Series([], dtype: float64)
