In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

In [116]:
# Step 1: Load the data from the Excel file
df = pd.read_excel('selected_essays 2.xlsx')


In [117]:
# Step 2: Separate the features (X) and the target variable (y)
target_column = 'ai_generated'
X = df.drop(columns=[target_column,'ai_llm', 'avg_sentence_length.1', 'ai_llm', 'word_tokens', 'sentence_tokens', 'lemmatized_word_tokens', 'word_freq', 'bigram_freq', 'trigram_freq'])
y = df[target_column]


In [118]:
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [119]:
# Step 4: Apply min-max scaling to ensure all features are non-negative
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [120]:
# Step 5: Perform feature selection using SelectKBest with chi-squared scoring
k = 7  # Replace 'k' with the number of top features you want to select
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

In [121]:
# Get the selected feature indices
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]

In [122]:
# Step 6: Train a machine learning model (Random Forest) using the selected features
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)


In [123]:
# Step 7: Evaluate the model on the test set
accuracy = model.score(X_test_selected, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.8695652173913043


In [124]:
# Step 8: (Optional) Get feature importances from the trained Random Forest model
feature_importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': selected_feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


In [125]:
print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
                       Feature  Importance
6                   smog_index    0.237797
1              stop_word_count    0.174178
4          flesch_reading_ease    0.162291
2         avg_parse_tree_depth    0.161151
0              avg_word_length    0.133219
5   flesch_kincaid_grade_level    0.074525
3  avg_adjectives_per_sentence    0.056840


In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [127]:
# Step 6: Hyperparameter tuning for RandomForestClassifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [128]:
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

In [129]:
# Best hyperparameters and best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [130]:
# Step 7: Trying different classification models
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train_selected, y_train)



In [131]:
# Step 8: Evaluate SVM model
svm_y_pred = svm_model.predict(X_test_selected)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_classification_report = classification_report(y_test, svm_y_pred)
print("SVM Model Accuracy:", svm_accuracy)
print("SVM Model Classification Report:")
print(svm_classification_report)


SVM Model Accuracy: 0.8260869565217391
SVM Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.86      0.86      0.86        14

    accuracy                           0.83        23
   macro avg       0.82      0.82      0.82        23
weighted avg       0.83      0.83      0.83        23



In [132]:
# Step 9: Trying Logistic Regression
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_selected, y_train)

In [133]:
# Step 10: Evaluate Logistic Regression model
logistic_y_pred = logistic_model.predict(X_test_selected)
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
logistic_classification_report = classification_report(y_test, logistic_y_pred)
print("Logistic Regression Model Accuracy:", logistic_accuracy)
print("Logistic Regression Model Classification Report:")
print(logistic_classification_report)

Logistic Regression Model Accuracy: 0.8260869565217391
Logistic Regression Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78         9
           1       0.86      0.86      0.86        14

    accuracy                           0.83        23
   macro avg       0.82      0.82      0.82        23
weighted avg       0.83      0.83      0.83        23



In [136]:
# Step 11: Evaluate all models
rf_y_pred = grid_search.best_estimator_.predict(X_test_selected)
svm_y_pred = svm_model.predict(X_test_selected)
logistic_y_pred = logistic_model.predict(X_test_selected)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)

# Get unique class labels from the target variable
classes = sorted(y_test.unique())

rf_classification_report = classification_report(y_test, rf_y_pred, output_dict=True)
svm_classification_report = classification_report(y_test, svm_y_pred, output_dict=True)
logistic_classification_report = classification_report(y_test, logistic_y_pred, output_dict=True)


In [141]:
# Step 12: Create a summary DataFrame to compare the results
summary_df = pd.DataFrame({
    'Model': ['Random Forest', 'SVM', 'Logistic Regression'],
    'Accuracy': [rf_accuracy, svm_accuracy, logistic_accuracy],
    'Precision_Human': [logistic_classification_report['0']['precision'], svm_classification_report['0']['precision'], logistic_classification_report['0']['precision']],
    'Recall_Human': [logistic_classification_report['0']['recall'], svm_classification_report['0']['recall'], logistic_classification_report['0']['recall']],
    'F1-score_Human': [logistic_classification_report['0']['f1-score'], svm_classification_report['0']['f1-score'], logistic_classification_report['0']['f1-score']],
    'Precision_AI': [logistic_classification_report['1']['precision'], svm_classification_report['1']['precision'], logistic_classification_report['1']['precision']],
    'Recall_AI': [logistic_classification_report['1']['recall'], svm_classification_report['1']['recall'], logistic_classification_report['1']['recall']],
    'F1-score_AI': [logistic_classification_report['1']['f1-score'], svm_classification_report['1']['f1-score'], logistic_classification_report['1']['f1-score']]
})

# Display the summary DataFrame
print(summary_df)


                 Model  Accuracy  Precision_Human  Recall_Human  \
0        Random Forest  0.869565         0.777778      0.777778   
1                  SVM  0.826087         0.777778      0.777778   
2  Logistic Regression  0.826087         0.777778      0.777778   

   F1-score_Human  Precision_AI  Recall_AI  F1-score_AI  
0        0.777778      0.857143   0.857143     0.857143  
1        0.777778      0.857143   0.857143     0.857143  
2        0.777778      0.857143   0.857143     0.857143  


In [143]:
# Step 2: Cross-validation to evaluate the model's performance
from sklearn.model_selection import GridSearchCV, cross_val_score
cv_scores = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

# Step 3: Analyze feature importances
feature_importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': selected_feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(feature_importance_df)

# Finally, test the model on the test set
test_accuracy = best_rf_model.score(X_test_selected, y_test)
print("Test Accuracy:", test_accuracy)

Cross-Validation Scores: [0.94444444 0.94444444 0.88888889 0.77777778 0.88235294]
Mean CV Score: 0.8875816993464053
Feature Importances:
                       Feature  Importance
6                   smog_index    0.237797
1              stop_word_count    0.174178
4          flesch_reading_ease    0.162291
2         avg_parse_tree_depth    0.161151
0              avg_word_length    0.133219
5   flesch_kincaid_grade_level    0.074525
3  avg_adjectives_per_sentence    0.056840
Test Accuracy: 0.8695652173913043


In [144]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Evaluate the model on the test set
y_pred = best_rf_model.predict(X_test_selected)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Check for potential overfitting
train_accuracy = best_rf_model.score(X_train_selected, y_train)
print("Train Accuracy:", train_accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Precision: 0.9230769230769231
Recall: 0.8571428571428571
F1-score: 0.888888888888889
Train Accuracy: 1.0
Confusion Matrix:
[[ 8  1]
 [ 2 12]]


In [145]:
# Find misclassified instances
misclassified_indices = y_test.index[y_test != y_pred]

# Get the misclassified instances from the original test set
misclassified_instances = df.iloc[misclassified_indices]

# Display the misclassified instances
print("Misclassified Instances:")
print(misclassified_instances)

# Save the trained model for future use and deployment
import joblib
joblib.dump(best_rf_model, 'model_filename.pkl')


Misclassified Instances:
              ai_llm  ai_generated  \
73   human-generated             0   
97  text-davinci-003             1   
91  text-davinci-003             1   

                                          word_tokens  \
73  ['builders', 'attempting', 'build', 'dock', 'e...   
97  ['remember', 'one', 'day', 'high', 'school', '...   
91  ['line', 'one', 'evening', 'grocery', 'store',...   

                                      sentence_tokens  \
73  ['the builders that were attempting to build a...   
97  ['\n\ni remember one day in high school when i...   
91  ['\n\ni was in line one evening at a grocery s...   

                               lemmatized_word_tokens  total_word_count  \
73  ['builder', 'attempting', 'build', 'dock', 'em...               316   
97  ['remember', 'one', 'day', 'high', 'school', '...               186   
91  ['line', 'one', 'evening', 'grocery', 'store',...               197   

    avg_word_length  avg_sentence_length       TTR  stop_word_c

['model_filename.pkl']

In [147]:
# Find misclassified instances
misclassified_indices = y_test.index[y_test != y_pred]

# Drop misclassified instances from the original DataFrame
df_cleaned = df.drop(index=misclassified_indices)

# Display the cleaned DataFrame without the misclassified instances
df_cleaned.head()


Unnamed: 0,ai_llm,ai_generated,word_tokens,sentence_tokens,lemmatized_word_tokens,total_word_count,avg_word_length,avg_sentence_length,TTR,stop_word_count,...,avg_adjectives_per_sentence,avg_adverbs_per_sentence,avg_verbs_per_sentence,avg_nouns_per_sentence,flesch_reading_ease,flesch_kincaid_grade_level,smog_index,sentiment_polarity,sentiment.subjectivity,perplexity
0,human-generated,0,"['dear', 'caps1', 'editor', 'think', 'computer...","['dear @caps1 editor, i think computers are a ...","['dear', 'caps1', 'editor', 'think', 'computer...",485,3.950515,16.884615,0.397938,236,...,1.384615,1.307692,3.230769,3.769231,71.24,8,11.1,0.315309,0.642392,95.445848
1,human-generated,0,"['dear', 'caps1', 'post', 'computers', 'advanc...","['dear @caps1 post, computers are an advance i...","['dear', 'caps1', 'post', 'computer', 'advance...",576,3.715278,14.027778,0.425347,274,...,0.583333,1.166667,2.888889,3.166667,82.24,6,9.0,0.310374,0.491213,127.964146
2,human-generated,0,"['dear', 'caps1', 'paper', 'editor', 'think', ...","['dear @caps1 paper editor, i think computers ...","['dear', 'caps1', 'paper', 'editor', 'think', ...",220,3.722727,17.727273,0.5,96,...,1.636364,1.0,3.272727,5.454545,68.6,9,11.4,0.302273,0.537121,88.122713
3,human-generated,0,"['organization1', 'use', 'computer', 'per', 'f...","['@organization1, the use of computer per fami...","['organization1', 'use', 'computer', 'per', 'f...",777,3.886744,15.688889,0.365508,370,...,0.933333,1.044444,3.266667,4.177778,72.46,7,10.3,0.038129,0.412458,99.46226
4,human-generated,0,"['dear', 'computer', 'technology', 'think', 'c...",['dear computer technology i think computers h...,"['dear', 'computer', 'technology', 'think', 'c...",402,3.781095,26.357143,0.343284,206,...,0.714286,0.928571,5.285714,6.071429,70.06,12,11.8,0.290681,0.490789,91.679042


In [149]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the cleaned data (df_cleaned)
df = df_cleaned

# Step 2: Separate the features (X) and the target variable (y)
target_column = 'ai_generated'
X = df.drop(columns=[target_column,'ai_llm', 'avg_sentence_length.1', 'ai_llm', 'word_tokens', 'sentence_tokens', 'lemmatized_word_tokens', 'word_freq', 'bigram_freq', 'trigram_freq'])
y = df[target_column]

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Apply min-max scaling to ensure all features are in the same range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Perform feature selection using SelectKBest with chi-squared scoring
k = 7  # Replace 'k' with the number of top features you want to select
selector = SelectKBest(score_func=chi2, k=k)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Step 6: Train a machine learning model (Random Forest) using the selected features
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)

# Step 7: Evaluate the model on the test set
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# Step 8: Print the evaluation results
print("Test Accuracy:", accuracy)
print("Classification Report:")
print(classification_report_result)


Test Accuracy: 0.9090909090909091
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        11
           1       0.91      0.91      0.91        11

    accuracy                           0.91        22
   macro avg       0.91      0.91      0.91        22
weighted avg       0.91      0.91      0.91        22



In [151]:
df_cleaned.to_excel('cleaned_df_project4.xlsx')