In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import string
import math
import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, cross_validate, validation_curve, learning_curve
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn import metrics


import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer


%matplotlib inline
import matplotlib.pyplot as plt
# Plot appearance: seaborn grid first, then the matplotlib style sheet on top.
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# NLTK resources fetched for text processing, mapped to what each one provides.
NLTK_RESOURCES = {
    'wordnet': 'WordNet for lemmatization',
    'omw-1.4': 'Open Multilingual Wordnet',
    'punkt': 'Tokenizer',
    'stopwords': 'Stopwords for text cleaning',
    'averaged_perceptron_tagger': 'POS tagger for part-of-speech tagging',
    'tagsets_json': 'Tagset resource',
}

# Download in the listed order, remembering the status of the most recent call.
download_ok = False
for resource_name in NLTK_RESOURCES:
    download_ok = nltk.download(resource_name)

# Echo the status returned by the final download as the cell's output.
download_ok

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets_json.zip.


True

## **Task 1**

In [None]:
# Fetch the assignment repository with git; the clone is created in ./Ai-Vs-Human.
!git clone https://github.com/dwhitfill/Ai-Vs-Human.git

Cloning into 'Ai-Vs-Human'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 18 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 2.77 MiB | 11.83 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [None]:
import pandas as pd
# Read the training set (Excel workbook) and the final test set (CSV) into DataFrames.
# NOTE(review): both paths are relative to the working directory, whereas the git
# clone lands in ./Ai-Vs-Human — confirm the files are really reachable from here.
df_train = pd.read_excel('AI_vs_huam_train_dataset.xlsx')
df_test = pd.read_csv('Final_test_data.csv')

In [None]:
import string
from nltk.corpus import stopwords
def text_process(essays):
    """Clean one essay for bag-of-words vectorization.

    ASCII punctuation is deleted (so hyphenated words are fused, e.g.
    "well-trained" becomes "welltrained"), the text is split on whitespace,
    and a token is kept only if it

    * is not an NLTK English stopword (compared case-insensitively),
    * is longer than one character,
    * is not a bare number such as "42", and
    * does not glue digits and ASCII letters together ("2nd", "covid19").

    Kept tokens retain their original casing.

    Args:
        essays: The raw essay text. Any non-``str`` value (NaN, None, ...)
            is treated as a missing essay.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` for
        non-string input.
    """
    if not isinstance(essays, str):
        return ""

    # A set gives constant-time membership tests; the raw NLTK list would be
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))

    # string.punctuation is ASCII-only, so typographic (curly) quotes survive.
    nopunc = ''.join(char for char in essays if char not in string.punctuation)

    # Tokens produced by split() are never empty or whitespace-only, so no
    # separate whitespace check is needed.
    kept = [
        word for word in nopunc.split()
        if word.lower() not in stop_words
        and len(word) > 1
        and not re.fullmatch(r'\d+(\.\d+)?', word)
        and not re.search(r'\d+[a-zA-Z]+|[a-zA-Z]+\d+', word)
    ]
    return ' '.join(kept)

In [None]:
# Add a 'clean_essays' column to both splits by running text_process on every essay.
for essay_frame in (df_train, df_test):
    essay_frame['clean_essays'] = essay_frame['essay'].apply(text_process)

In [None]:
# Preview the first rows to compare each raw essay with its cleaned version.
df_train.head()

Unnamed: 0,essay,label,clean_essays
0,International sports events require the most w...,0,International sports events require welltraine...
1,Globalisation has become a significant aspect ...,0,Globalisation become significant aspect world’...
2,There is an ever-increasing number of bullying...,0,everincreasing number bullying activities nume...
3,"It is commonly believed, that companies should...",0,commonly believed companies dress code policy ...
4,Despite knowing about the adverse effects of c...,0,Despite knowing adverse effects climate change...


In [None]:
from nltk.stem import WordNetLemmatizer

def text_lemmatizer(text):
    """Strip punctuation and stopwords from an essay and lemmatize the rest.

    ASCII punctuation is deleted, the text is split on whitespace, NLTK
    English stopwords are dropped (case-insensitive comparison), and each
    remaining token is passed through ``WordNetLemmatizer.lemmatize`` with
    its default arguments. Unlike ``text_process``, numbers and
    single-character tokens are kept.

    Args:
        text: The raw essay text. Any non-``str`` value is treated as a
            missing essay.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Set membership is constant-time; the NLTK list would be scanned per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    nopunc = ''.join(char for char in text if char not in string.punctuation)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in nopunc.split()
        if word.lower() not in stop_words
    )

In [None]:
# Add a 'lemmatized_essays' column to both splits via text_lemmatizer.
for essay_frame in (df_train, df_test):
    essay_frame['lemmatized_essays'] = essay_frame['essay'].apply(text_lemmatizer)

In [None]:
# Preview the lemmatized text for the first few essays.
df_train['lemmatized_essays'].head()

Unnamed: 0,lemmatized_essays
0,International sport event require welltrained ...
1,Globalisation become significant aspect world’...
2,everincreasing number bullying activity numero...
3,commonly believed company dress code policy em...
4,Despite knowing adverse effect climate change ...


## **Task 2**

In [None]:
# Model input is the stopword-filtered text ('clean_essays'); the lemmatized
# column is not used here.
X = df_train['clean_essays']
# Target labels. NOTE(review): the preview rows all show 0 — confirm which class 0 and 1 denote.
y = df_train['label']

In [None]:
# Hold out 20% of the essays for validation; stratifying on y keeps the label
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Binary bag-of-words: binary=True records only whether a term occurs in an
# essay, not how many times.
vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the training split only ...
X_train_dtm = vectorizer.fit_transform(X_train)
# ... and reuse it unchanged to encode the validation split.
X_val_dtm = vectorizer.transform(X_val)

In [None]:
# Baseline classifier: logistic regression on the binary bag-of-words matrix.
# One fit is enough — refitting the same estimator on the same data with the
# same seed only repeats the work.
dyl = LogisticRegression(random_state=0)
# fit() returns the estimator itself, so the cell still displays the fitted model.
dyl.fit(X_train_dtm, y_train)

In [None]:
# Predict labels for the validation essays with the logistic-regression baseline.
y_pred_class = dyl.predict(X_val_dtm)

In [None]:
# Validation accuracy of the logistic-regression baseline.
acc = metrics.accuracy_score(y_val, y_pred_class)
acc

0.9785522788203753

In [None]:
# NOTE: this rebinds `vectorizer` — from here on it is the TF-IDF vectorizer,
# not the CountVectorizer used for the logistic-regression baseline.
# Uni- to tri-grams, capped at 5000 features; terms must appear in at least
# 2 essays (min_df) and in at most 80% of them (max_df).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=5000,
    min_df=2,
    max_df=0.8,
)

In [None]:
# Fit the TF-IDF vocabulary and weights on the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... then encode the validation split with that fitted vectorizer.
X_val_tfidf = vectorizer.transform(X_val)

In [None]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

In [None]:
# Naive Bayes predictions for the validation split (not the final test set).
y_pred_class_tfidf = nb_tfidf.predict(X_val_tfidf)

In [None]:
# Validation accuracy of the TF-IDF + Naive Bayes model.
acc1 = metrics.accuracy_score(y_val, y_pred_class_tfidf)
acc1

0.9705093833780161

## **Task 3**

In [None]:
# TF-IDF features feeding a support-vector classifier; both steps are tuned
# by the grid search via the 'vectorizer__' / 'classifier__' parameter prefixes.
pipeline = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', SVC(random_state=42)),
    ]
)

In [None]:
# Search space for the SVM pipeline: 3 * 3 * 3 * 2 * 2 = 108 combinations.
# Keyword names follow the '<step>__<parameter>' convention of Pipeline.
# Note: gamma only affects the rbf kernel here, so every linear-kernel setting
# is evaluated once per gamma value.
svm_param_grid = dict(
    vectorizer__max_features=[1000, 5000, 10000],
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
    classifier__gamma=['scale', 'auto'],
)

In [None]:
# Exhaustive search over svm_param_grid, scored by 5-fold cross-validated
# accuracy on all available cores; training-fold scores are recorded as well.
svm_grid_search = GridSearchCV(
    pipeline,
    svm_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the SVM search on the training split (108 candidates x 5 folds = 540 fits).
# Later cells read best_estimator_, which exists only once this call has finished.
svm_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Search space for the decision-tree pipeline: 2^6 = 64 combinations.
# Keyword names follow the '<step>__<parameter>' convention of Pipeline.
dt_param_grid = dict(
    vectorizer__max_features=[1000, 5000],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    classifier__criterion=['gini', 'entropy'],
    classifier__max_depth=[10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)


In [None]:
# TF-IDF features feeding a decision tree with a fixed seed.
pipelineDT = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ]
)

In [None]:
# Exhaustive search over dt_param_grid, scored by 5-fold cross-validated
# accuracy on all available cores; training-fold scores are recorded as well.
dt_grid_search = GridSearchCV(
    pipelineDT,
    dt_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the decision-tree search on the training split (64 candidates x 5 folds = 320 fits).
# Later cells read best_estimator_, which exists only once this call has finished.
dt_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


## **Task 4**

In [None]:
# Re-score the tuned SVM pipeline with plain 5-fold CV.
# best_estimator_ is only available after svm_grid_search.fit(...) has completed.
# X / y are the full training frame, so they include the rows the search was tuned on.
best_svm_model = svm_grid_search.best_estimator_
svm_scores = cross_val_score(
    best_svm_model,
    X,
    y,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

svm_mean, svm_std = np.mean(svm_scores), np.std(svm_scores)
print("Accuracy Scores :", svm_scores)
print("Mean Accuracy:", svm_mean)
print("Standard Deviation:", svm_std)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
# Re-score the tuned decision-tree pipeline with plain 5-fold CV.
# best_estimator_ is only available after dt_grid_search.fit(...) has completed.
# X / y are the full training frame, so they include the rows the search was tuned on.
best_dt_model = dt_grid_search.best_estimator_
dt_scores = cross_val_score(
    best_dt_model,
    X,
    y,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

dt_mean, dt_std = np.mean(dt_scores), np.std(dt_scores)
print("Accuracy Scores:", dt_scores)
print("Mean Accuracy:", dt_mean)
print("Standard Deviation:", dt_std)

Accuracy Scores: [0.79356568 0.85254692 0.95710456 0.97315436 0.95033557]
Mean Accuracy: 0.90534141821257
Standard Deviation: 0.07011927830507317


In [None]:
# Shuffled, seeded, stratified 5-fold splitter; the decision-tree cell that
# follows reuses it, so this cell has to run first.
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the tuned SVM pipeline (best_svm_model from the earlier CV cell) on those folds.
svm_strat_scores = cross_val_score(
    best_svm_model,
    X,
    y,
    cv=strat_kfold,
    scoring='accuracy',
    n_jobs=-1,
)

svm_strat_mean, svm_strat_std = np.mean(svm_strat_scores), np.std(svm_strat_scores)
print("Accuracy Scores:", svm_strat_scores)
print("Mean Accuracy:", svm_strat_mean)
print("Standard Deviation:", svm_strat_std)

Accuracy Scores: [0.98659517 0.97855228 0.97989276 0.98389262 0.97449664]
Mean Accuracy: 0.9806858952444356
Standard Deviation: 0.0042116707724733585


In [None]:
# Score the tuned decision-tree pipeline on the shuffled stratified folds.
# Needs strat_kfold and best_dt_model from the earlier cells to be defined.
dt_strat_scores = cross_val_score(
    best_dt_model,
    X,
    y,
    cv=strat_kfold,
    scoring='accuracy',
    n_jobs=-1,
)

dt_strat_mean, dt_strat_std = np.mean(dt_strat_scores), np.std(dt_strat_scores)
print("Accuracy Scores:", dt_strat_scores)
print("Mean Accuracy:", dt_strat_mean)
print("Standard Deviation:", dt_strat_std)

NameError: name 'strat_kfold' is not defined

## **Task 5 was done in Task 3**

In [None]:
# Build the submission from predictions on the final test essays. The
# validation-split predictions (y_pred_class_tfidf) describe rows of the
# training file, so they must not be written out as test results.

# `vectorizer` is rebound earlier in the notebook; make sure it is the fitted
# TF-IDF vectorizer that nb_tfidf was trained with before encoding the test set.
assert isinstance(vectorizer, TfidfVectorizer), "expected the fitted TF-IDF vectorizer"
X_test_tfidf = vectorizer.transform(df_test['clean_essays'])
y_test_pred = nb_tfidf.predict(X_test_tfidf)

# One row per test essay, numbered by position in df_test.
results = pd.DataFrame({
    'essay_id': range(len(df_test)),
    'predicted_label': y_test_pred
})
results.to_csv('Dylan_Whitfill_Assignment_2_R11903920.csv', index=False)

# Colab-only helper that triggers a browser download of the written file.
from google.colab import files
files.download('Dylan_Whitfill_Assignment_2_R11903920.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle

# Save only the best model (not the whole grid)
# best_estimator_ is the full pipeline (TF-IDF vectorizer + SVC), so the pickle
# accepts raw text. It exists only after svm_grid_search.fit(...) has run.
with open('svm_grid_model.pkl', 'wb') as f:
    pickle.dump(svm_grid_search.best_estimator_, f)


In [None]:
from google.colab import files
# Colab-only: trigger a browser download of the pickled SVM pipeline.
files.download('svm_grid_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pickle

In [None]:
# Pickle the best decision-tree pipeline (TF-IDF vectorizer + tree) found by
# the grid search; requires dt_grid_search.fit(...) to have run.
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt_grid_search.best_estimator_, f)

In [None]:
from google.colab import files
# Colab-only: trigger a browser download of the pickled decision-tree pipeline.
files.download('decision_tree_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over depth-1 trees (decision stumps), fed by TF-IDF features.
decision_stump = DecisionTreeClassifier(max_depth=1)
pipelineADA = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', AdaBoostClassifier(estimator=decision_stump, random_state=42)),
])

# Deliberately a single-candidate grid for now, to avoid long runs.
ada_param_grid = dict(
    vectorizer__max_features=[1000],
    vectorizer__ngram_range=[(1, 1)],
    classifier__n_estimators=[50],
    classifier__learning_rate=[1.0],
)

# Same search settings as the other models: 5-fold CV accuracy on all cores,
# with training-fold scores recorded.
ada_grid_search = GridSearchCV(
    pipelineADA,
    ada_param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training split; fit() returns the search object, which the cell displays.
ada_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Pickle the fitted AdaBoost pipeline and download it (Colab-only).
# Relies on `pickle` and `files` having been imported by earlier cells.
with open('adaboost_model.pkl', 'wb') as f:
    pickle.dump(ada_grid_search.best_estimator_, f)

files.download('adaboost_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Extract the fitted TF-IDF step from the best decision-tree pipeline and pickle it alone.
# NOTE(review): this vectorizer carries the settings chosen for the decision tree;
# the SVM and AdaBoost pipelines each hold their own fitted vectorizer — confirm
# this file is only paired with the decision-tree classifier.
tfidf_vectorizer = dt_grid_search.best_estimator_.named_steps['vectorizer']

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)



In [None]:
from google.colab import files
# Colab-only: trigger a browser download of the pickled TF-IDF vectorizer.
files.download('tfidf_vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>