In [1]:
from google.colab import drive     # Mounting the drive on colab notebook
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

data = pd.read_csv('/content/drive/My Drive/Colab Notebooks- 1804/comp1804_coursework_dataset_23-24.csv')


# Data Exploration and Assessment


In [3]:
# Removing duplicates
duplicates_rows = data.duplicated()
print(f'There are {duplicates_rows.sum()} duplicates rows.')
data = data[~duplicates_rows]

There are 210 duplicates rows.


In [4]:
# Shape attribute
print(data.shape)


(9137, 8)


In [5]:
# Column attribute
print(data.columns)

Index(['par_id', 'paragraph', 'has_entity', 'lexicon_count', 'difficult_words',
       'last_editor_gender', 'category', 'text_clarity'],
      dtype='object')


In [6]:
# Data types of all features
print(data.dtypes)

par_id                  int64
paragraph              object
has_entity             object
lexicon_count           int64
difficult_words       float64
last_editor_gender     object
category               object
text_clarity           object
dtype: object


In [7]:
# Head function
print(data.head())

         par_id                                          paragraph  \
0  428209002237  Ramsay was born in Glasgow on 2 October 1852. ...   
1  564218010072  It has been widely estimated for at least the ...   
2  291401001672  He went on to win the Royal Medal of the Royal...   
3   31548004883  The changes have altered many underlying assum...   
4   50634005146  After these novels were published, Disraeli de...   

                        has_entity  lexicon_count  difficult_words  \
0   ORG_YES_PRODUCT_NO_PERSON_YES_             49             12.0   
1    ORG_YES_PRODUCT_NO_PERSON_NO_            166             47.0   
2    ORG_YES_PRODUCT_NO_PERSON_NO_             69             18.0   
3    ORG_NO_PRODUCT_YES_PERSON_NO_             76             27.0   
4  ORG_YES_PRODUCT_YES_PERSON_YES_            200             47.0   

  last_editor_gender                 category      text_clarity  
0                man              biographies      clear_enough  
1                man  arti

In [8]:
# Sample function
print(data.sample(10))

            par_id                                          paragraph  \
1115  452865009663  In 1987, the Biotechnology Laboratory, one of ...   
6985  831020011691  In computing, POSIX Threads, commonly known as...   
1663    6360003821  It has been argued AI will become so powerful ...   
7694  156701007067  Prior to the award of the Nobel Prize in Physi...   
1710  792029011421  One argument against dualism is with regard to...   
2135   19675000468  Robert Hooke FRS  was an English naturalist, a...   
1217  396559009369  If it can be assumed that anything that can sw...   
3852   19980004590  Since, however, all output truth values are co...   
1602  196502007392  Some critics argue that Pascal's wager, for th...   
6940  657766010683   Hunter had taught Edward Jenner who is seen a...   

                          has_entity  lexicon_count  difficult_words  \
1115   ORG_YES_PRODUCT_NO_PERSON_NO_             58             17.0   
6985   ORG_YES_PRODUCT_NO_PERSON_NO_             86 

In [9]:
# Statistics about the numerical features
print(data.describe())

             par_id  lexicon_count  difficult_words
count  9.137000e+03    9137.000000      9119.000000
mean   3.571775e+11      82.023312        21.532405
std    3.220746e+11      63.448698        16.311847
min    8.500328e+07       0.000000         0.000000
25%    7.019601e+10      33.000000         9.000000
50%    2.684380e+11      64.000000        17.000000
75%    6.124310e+11     117.000000        30.000000
max    1.058779e+12     653.000000       143.000000


In [10]:
# Value count of categorical columns
categorical_columns= ['has_entity', 'last_editor_gender', 'category', 'text_clarity']

for col in categorical_columns:
  print(f"Categories and number of occurrences for '{col}'")
  print(data[col].value_counts())
  print()


Categories and number of occurrences for 'has_entity'
ORG_YES_PRODUCT_NO_PERSON_YES_     2999
ORG_NO_PRODUCT_NO_PERSON_NO_       2795
ORG_YES_PRODUCT_NO_PERSON_NO_      1448
ORG_NO_PRODUCT_NO_PERSON_YES_      1349
ORG_YES_PRODUCT_YES_PERSON_YES_     292
ORG_YES_PRODUCT_YES_PERSON_NO_      124
ORG_NO_PRODUCT_YES_PERSON_YES_       64
ORG_NO_PRODUCT_YES_PERSON_NO_        42
data missing                         24
Name: has_entity, dtype: int64

Categories and number of occurrences for 'last_editor_gender'
man                  6105
woman                2414
non-binary            353
prefer_not_to_say     265
Name: last_editor_gender, dtype: int64

Categories and number of occurrences for 'category'
biographies                             2888
philosophy                              2513
programming                             1939
artificial intelligence                 1527
movies about artificial intelligence     162
Philosophy                                13
Biographies               

In [11]:
# Missing values
print(data.isna().sum())

par_id                   0
paragraph                0
has_entity               0
lexicon_count            0
difficult_words         18
last_editor_gender       0
category                61
text_clarity          9057
dtype: int64


# Data Splitting and Cleaning

In [12]:
import numpy as np

# Replacing "data missing" with NaN in "has_entity" column
data.loc[data['has_entity'] == 'data missing', 'has_entity'] = np.nan

# Let's verify the changes
print(data['has_entity'].value_counts(dropna=False))

ORG_YES_PRODUCT_NO_PERSON_YES_     2999
ORG_NO_PRODUCT_NO_PERSON_NO_       2795
ORG_YES_PRODUCT_NO_PERSON_NO_      1448
ORG_NO_PRODUCT_NO_PERSON_YES_      1349
ORG_YES_PRODUCT_YES_PERSON_YES_     292
ORG_YES_PRODUCT_YES_PERSON_NO_      124
ORG_NO_PRODUCT_YES_PERSON_YES_       64
ORG_NO_PRODUCT_YES_PERSON_NO_        42
NaN                                  24
Name: has_entity, dtype: int64


In [13]:
# Missing values from all of the features
print(data.isna().sum())

par_id                   0
paragraph                0
has_entity              24
lexicon_count            0
difficult_words         18
last_editor_gender       0
category                61
text_clarity          9057
dtype: int64


In [14]:
from sklearn.impute import SimpleImputer

# Droping rows with missing values in "has_entity" and "category" columns
data.dropna(subset=['has_entity', 'category'], inplace=True)

# Imputing missing values in "difficult_words" column using the mean stragtegy
impt_difficult_words = SimpleImputer(strategy='mean')
data['difficult_words'] = impt_difficult_words.fit_transform(data[['difficult_words']])

# Verifying missing values after cleaning
missing_values_after_cleaning = data.isnull().sum()
print("Missing values after cleaning:\n", missing_values_after_cleaning)

Missing values after cleaning:
 par_id                   0
paragraph                0
has_entity               0
lexicon_count            0
difficult_words          0
last_editor_gender       0
category                 0
text_clarity          8972
dtype: int64


In [15]:
# Converting all labels in "category" column to lowercase
data['category'] = data['category'].str.lower()

# Mapping old labels to new labels
merge_classes = {
    'philosophy': 'philosophy',
    'biographies': 'biographies',
    'programming': 'programming',
    'artificial intelligence': 'artificial intelligence',
    'movies about artificial intelligence': 'movies about artificial intelligence'
}

# Updating the 'category' column
data['category'] = data['category'].map(merge_classes)

# Updated distribution of classes in "category" column
print(data['category'].value_counts())

biographies                             2891
philosophy                              2521
programming                             1944
artificial intelligence                 1534
movies about artificial intelligence     162
Name: category, dtype: int64


In [16]:
# Let's recheck the missing values again
print(data.isna().sum())

par_id                   0
paragraph                0
has_entity               0
lexicon_count            0
difficult_words          0
last_editor_gender       0
category                 0
text_clarity          8972
dtype: int64


In [17]:
from sklearn.model_selection import train_test_split

# Task 1
X_t1 = data[['paragraph', 'has_entity']] # Input Features for Task 1
Y_t1 = data['category']  # Target variable for Task 1

# Stratified split for Task 1
X_train_t1, X_temp_t1, Y_train_t1, Y_temp_t1 = train_test_split(X_t1, Y_t1, test_size=0.3, random_state=42, stratify=Y_t1)
X_val_t1, X_test_t1, Y_val_t1, Y_test_t1 = train_test_split(X_temp_t1, Y_temp_t1, test_size=0.5, random_state=42, stratify=Y_temp_t1)

print("Task 1:")
print("Train set shape:", X_train_t1.shape, Y_train_t1.shape)
print("Validation set shape:", X_val_t1.shape, Y_val_t1.shape)
print("Test set shape:", X_test_t1.shape, Y_test_t1.shape)

# Task 2
# Creatingg a subset
subset = data.dropna(subset=['text_clarity'])[['paragraph', 'lexicon_count', 'difficult_words', 'text_clarity']]

X_t2 = subset[['paragraph', 'lexicon_count', 'difficult_words']] # Input Features for Task 2
Y_t2 = subset['text_clarity'] # Target Variable for Task 2

# Splitting the subset into train/validation/test datasets
X_train_val_2, X_test_2, Y_train_val_2, Y_test_2 = train_test_split(X_t2, Y_t2, test_size=0.15, random_state=42, stratify=Y_t2)
X_train_2, X_val_2, Y_train_2, Y_val_2 = train_test_split(X_train_val_2, Y_train_val_2, test_size=0.15/0.85, random_state=42, stratify=Y_train_val_2)

# Lengths of train/validation/test datasets
print("Train dataset length:", len(X_train_2))
print("Validation dataset length:", len(X_val_2))
print("Test dataset length:", len(X_test_2))

Task 1:
Train set shape: (6336, 2) (6336,)
Validation set shape: (1358, 2) (1358,)
Test set shape: (1358, 2) (1358,)
Train dataset length: 56
Validation dataset length: 12
Test dataset length: 12


# Data Encoding

In [18]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Function for text pre-processing
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Applying text preprocessing to "paragraph" column for Task 1
X_train_t1['paragraph_preprocessed'] = X_train_t1['paragraph'].apply(preprocess_text)
X_val_t1['paragraph_preprocessed'] = X_val_t1['paragraph'].apply(preprocess_text)
X_test_t1['paragraph_preprocessed'] = X_test_t1['paragraph'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_features=1200)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_t1['paragraph_preprocessed'])
X_val_tfidf = tfidf_vectorizer.transform(X_val_t1['paragraph_preprocessed'])
X_test_tfidf = tfidf_vectorizer.transform(X_test_t1['paragraph_preprocessed'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# One-hot encoding on "has_entity"
def one_hot_encoding(text):
    entities = text.split('_')
    encoded = {'ORG': 0, 'PRODUCT': 0, 'PERSON': 0}
    for i in range(0, len(entities) - 1, 2):
        entity = entities[i]
        status = entities[i + 1] if i + 1 < len(entities) else ''
        if status == 'YES':
            if 'ORG' in entity:
                encoded['ORG'] = 1
            elif 'PRODUCT' in entity:
                encoded['PRODUCT'] = 1
            elif 'PERSON' in entity:
                encoded['PERSON'] = 1
    return encoded['ORG'], encoded['PRODUCT'], encoded['PERSON']

# Applying one-hot encoding to "has_entity" column for Task 1
X_train_t1['org'], X_train_t1['prod'], X_train_t1['person'] = zip(*X_train_t1['has_entity'].apply(one_hot_encoding))
X_val_t1['org'], X_val_t1['prod'], X_val_t1['person'] = zip(*X_val_t1['has_entity'].apply(one_hot_encoding))
X_test_t1['org'], X_test_t1['prod'], X_test_t1['person'] = zip(*X_test_t1['has_entity'].apply(one_hot_encoding))


In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizing "paragraph" text for Task 2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_2['paragraph'])
X_train_seq = tokenizer.texts_to_sequences(X_train_2['paragraph'])
X_val_seq = tokenizer.texts_to_sequences(X_val_2['paragraph'])
X_test_seq = tokenizer.texts_to_sequences(X_test_2['paragraph'])

# Padding sequences
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


In [21]:
from sklearn.preprocessing import LabelEncoder

# Initializing LabelEncoder
label_encoder = LabelEncoder()

# Fitting and transforming target labels for training data
Y_train_encoded_2 = label_encoder.fit_transform(Y_train_2)
Y_val_encoded_2 = label_encoder.transform(Y_val_2)
Y_test_encoded_2 = label_encoder.transform(Y_test_2)


In [22]:
# Dropping original columns
X_train_t1.drop(columns=['paragraph', 'has_entity'], inplace=True)
X_val_t1.drop(columns=['paragraph', 'has_entity'], inplace=True)
X_test_t1.drop(columns=['paragraph', 'has_entity'], inplace=True)

In [23]:
# Verifying column-drop
print(X_train_t1.head(3))
print(X_val_t1.head(3))
print(X_test_t1.head(3))


                                 paragraph_preprocessed  org  prod  person
6008  although george darwin son famous biologist ch...    1     0       1
3051  extreme goal ai research create computer progr...    1     0       0
6552  einstein two events taking place points b syst...    1     0       0
                                 paragraph_preprocessed  org  prod  person
235   thomson first investigated magnetic deflection...    1     0       0
575   process breaking concept proposition fact simp...    1     0       1
4937  propositions also spoken content beliefs simil...    0     0       0
                                 paragraph_preprocessed  org  prod  person
8552  socratic seminars based upon interaction peers...    0     0       0
2329  discipline history philosophy aims provide sys...    0     0       0
4859  building graphical user interfaces way testing...    1     0       0


# Task 1 : Topic Classification

In [24]:
# Logistic Regression model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Initializing StandardScaler
scaler = StandardScaler(with_mean=False)

# Scaling TF-IDF transformed features
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf)
X_val_tfidf_scaled = scaler.transform(X_val_tfidf)
X_test_tfidf_scaled = scaler.transform(X_test_tfidf)

# Combining TF-IDF transformed features with one-hot encoded "has_entity" features
X_train_combined = hstack([X_train_tfidf_scaled, X_train_t1[['org', 'prod', 'person']].values])
X_val_combined = hstack([X_val_tfidf_scaled, X_val_t1[['org', 'prod', 'person']].values])
X_test_combined = hstack([X_test_tfidf_scaled, X_test_t1[['org', 'prod', 'person']].values])

# Hyperparameters to tune
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [ 0.1, 1, 10],
    'solver': [ 'saga'],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000]
}

# Initializing GridSearchCV with Logistic Regression model and hyperparameters
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=3, scoring='accuracy', verbose=1)

# Fitting GridSearchCV to find the best hyperparameters
grid_search.fit(X_train_combined, Y_train_t1)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Initializing Logistic Regression model with the best hyperparameters
best_logistic_model = LogisticRegression(**best_params, random_state=42)

# Training the model on the combined features
best_logistic_model.fit(X_train_combined, Y_train_t1)

# Predicting on the training set using the trained model
train_predictions_lg = best_logistic_model.predict(X_train_combined)

# Predicting on the validation set using the best model
val_predictions_lg = best_logistic_model.predict(X_val_combined)

# Predicting on the test set using the best model
test_predictions_lg = best_logistic_model.predict(X_test_combined)

# Calculating accuracy on the training set
train_accuracy_lg = accuracy_score(Y_train_t1, train_predictions_lg)
print("Training Accuracy:", train_accuracy_lg)

# Calculating accuracy on the validation set
val_accuracy_lg = accuracy_score(Y_val_t1, val_predictions_lg)
print("Validation Accuracy (Logistic Regression):", val_accuracy_lg)

# Calculating accuracy on the test set
test_accuracy_lg = accuracy_score(Y_test_t1, test_predictions_lg)
print("Test Accuracy (Logistic Regression):", test_accuracy_lg)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

In [25]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Initializing StandardScaler
scaler = StandardScaler(with_mean=False)

# Scaling TF-IDF transformed features
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf)
X_val_tfidf_scaled = scaler.transform(X_val_tfidf)
X_test_tfidf_scaled = scaler.transform(X_test_tfidf)

# Combining TF-IDF transformed features with one-hot encoded "has_entity" features
X_train_combined = hstack([X_train_tfidf_scaled, X_train_t1[['org', 'prod', 'person']].values])
X_val_combined = hstack([X_val_tfidf_scaled, X_val_t1[['org', 'prod', 'person']].values])
X_test_combined = hstack([X_test_tfidf_scaled, X_test_t1[['org', 'prod', 'person']].values])


# Initializing Random Forest model
random_forest_model = RandomForestClassifier(random_state=42)

# Hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initializing GridSearchCV with Random Forest model and hyperparameters
grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

# Fitting GridSearchCV to find the best hyperparameters
grid_search_rf.fit(X_train_combined, Y_train_t1)

# Best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters:", best_params_rf)

# Initializing Random Forest model with the best hyperparameters
best_random_forest_model = RandomForestClassifier(**best_params_rf, random_state=42)

# Training the model on the combined features
best_random_forest_model.fit(X_train_combined, Y_train_t1)

# Predict on the training set using the trained model
train_predictions_rf = best_random_forest_model.predict(X_train_combined)

# Predicting on the validation set using the best model
val_predictions_rf = best_random_forest_model.predict(X_val_combined)

# Predicting on the test set using the best model
test_predictions_rf = best_random_forest_model.predict(X_test_combined)

# Calculating accuracy on the training set
train_accuracy_rf = accuracy_score(Y_train_t1, train_predictions_rf)
print("Training Accuracy:", train_accuracy_rf)

# Calculating accuracy on the validation set
val_accuracy_rf = accuracy_score(Y_val_t1, val_predictions_rf)
print("Validation Accuracy (Random Forest):", val_accuracy_rf)

# Calculating accuracy on the test set
test_accuracy_rf = accuracy_score(Y_test_t1, test_predictions_rf)
print("Test Accuracy (Random Forest):", test_accuracy_rf)


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Training Accuracy: 0.9977904040404041
Validation Accuracy (Random Forest): 0.8232695139911634
Test Accuracy (Random Forest): 0.8136966126656848


In [26]:
# Support Vector Machine (SVM) model
from sklearn.svm import SVC

# Initializing StandardScaler
scaler = StandardScaler(with_mean=False)

# Scaling TF-IDF transformed features
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf)
X_val_tfidf_scaled = scaler.transform(X_val_tfidf)
X_test_tfidf_scaled = scaler.transform(X_test_tfidf)

# Combining TF-IDF transformed features with one-hot encoded "has_entity" features
X_train_combined = hstack([X_train_tfidf_scaled, X_train_t1[['org', 'prod', 'person']].values])
X_val_combined = hstack([X_val_tfidf_scaled, X_val_t1[['org', 'prod', 'person']].values])
X_test_combined = hstack([X_test_tfidf_scaled, X_test_t1[['org', 'prod', 'person']].values])

# Hyperparameters to tune
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'class_weight': [None, 'balanced']
}

# Initializing GridSearchCV with SVM model and hyperparameters
svm_grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=3, scoring='accuracy')

# Fitting GridSearchCV to find the best hyperparameters
svm_grid_search.fit(X_train_combined, Y_train_t1)

# Best hyperparameters
best_svm_params = svm_grid_search.best_params_
print("Best SVM Hyperparameters:", best_svm_params)

# Initializing SVM model with the best hyperparameters
best_svm_model = SVC(**best_svm_params, random_state=42)

# Training the model on the combined features
best_svm_model.fit(X_train_combined, Y_train_t1)

# Predicting on the training set using the trained model
train_predictions_svm = best_svm_model.predict(X_train_combined)

# Predicting on the validation set using the best model
val_predictions_svm = best_svm_model.predict(X_val_combined)

# Predicting on the test set using the best model
test_predictions_svm = best_svm_model.predict(X_test_combined)

# Calculating accuracy on the training set
train_accuracy_svm = accuracy_score(Y_train_t1, train_predictions_svm)
print("Training Accuracy:", train_accuracy_svm)

# Calculating accuracy on the validation set
val_accuracy_svm = accuracy_score(Y_val_t1, val_predictions_svm)
print("SVM Validation Accuracy:", val_accuracy_svm)

# Calculating accuracy on the test set
test_accuracy_svm = accuracy_score(Y_test_t1, test_predictions_svm)
print("SVM Test Accuracy:", test_accuracy_svm)


Best SVM Hyperparameters: {'C': 0.1, 'class_weight': 'balanced', 'kernel': 'sigmoid'}
Training Accuracy: 0.8969381313131313
SVM Validation Accuracy: 0.8755522827687776
SVM Test Accuracy: 0.8807069219440353


In [27]:
# Evaluating the final model (SVM)

from sklearn.metrics import confusion_matrix, classification_report

# Confusion Matrix
conf_matrix = confusion_matrix(Y_test_t1, test_predictions_svm)

# Classification Report
class_report = classification_report(Y_test_t1, test_predictions_svm)

# Accuracy
accuracy = accuracy_score(Y_test_t1, test_predictions_svm)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("\nAccuracy:", accuracy)


Confusion Matrix:
[[202   4   2  12  10]
 [ 14 362   4  48   6]
 [  0   0  25   0   0]
 [ 16  13   1 341   7]
 [  7   2   1  15 266]]

Classification Report:
                                      precision    recall  f1-score   support

             artificial intelligence       0.85      0.88      0.86       230
                         biographies       0.95      0.83      0.89       434
movies about artificial intelligence       0.76      1.00      0.86        25
                          philosophy       0.82      0.90      0.86       378
                         programming       0.92      0.91      0.92       291

                            accuracy                           0.88      1358
                           macro avg       0.86      0.91      0.88      1358
                        weighted avg       0.89      0.88      0.88      1358


Accuracy: 0.8807069219440353


# Task 2 : Text Clarity Classification Prototype

In [28]:
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from keras.losses import binary_crossentropy
import numpy as np

# Concatenating lexicon count and difficult words columns with padded sequences
X_train_pad_with_features = np.concatenate((X_train_pad, X_train_2[['lexicon_count', 'difficult_words']].values), axis=1)
X_val_pad_with_features = np.concatenate((X_val_pad, X_val_2[['lexicon_count', 'difficult_words']].values), axis=1)
X_test_pad_with_features = np.concatenate((X_test_pad, X_test_2[['lexicon_count', 'difficult_words']].values), axis=1)

# Hyperparameters
num_words = len(tokenizer.word_index) + 1
embedding_dim = 100
learning_rate = 0.001
batch_size = 32
num_epochs = 10
max_len = 100

# Main input for the text data
main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

# Embedding layer for text data
embedding_layer = Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_len)(main_input)
lstm_output = LSTM(128)(embedding_layer)

# Ddditional input layers for lexicon_count and difficult_words
lexicon_input = Input(shape=(2,), name='lexicon_input')

# Concatenating the LSTM output with additional features
concatenated = Concatenate()([lstm_output, lexicon_input])

# Dense output layer with sigmoid activation for binary classification
output = Dense(1, activation='sigmoid')(concatenated)

# Defining the model
model = Model(inputs=[main_input, lexicon_input], outputs=output)

# Compiling the model
model.compile(loss=binary_crossentropy, optimizer='adam', metrics=['accuracy'])

# Training the model
history = model.fit([X_train_pad, X_train_2[['lexicon_count', 'difficult_words']].values], Y_train_encoded_2,
                    validation_data=([X_val_pad, X_val_2[['lexicon_count', 'difficult_words']].values], Y_val_encoded_2),
                    epochs=num_epochs, batch_size=batch_size)

# Evaluating the model on training data
train_loss, train_accuracy = model.evaluate([X_train_pad, X_train_2[['lexicon_count', 'difficult_words']].values], Y_train_encoded_2)
print("Training Loss:", train_loss)
print("Training Accuracy:", train_accuracy)

# Evaluating the model on validation data
val_loss, val_accuracy = model.evaluate([X_val_pad, X_val_2[['lexicon_count', 'difficult_words']].values], Y_val_encoded_2)
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)

# Evaluating the model on test data
test_loss, test_accuracy = model.evaluate([X_test_pad, X_test_2[['lexicon_count', 'difficult_words']].values], Y_test_encoded_2)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Loss: 5.876242637634277
Training Accuracy: 0.2142857164144516
Validation Loss: 5.8593316078186035
Validation Accuracy: 0.25
Test Loss: 6.164886474609375
Test Accuracy: 0.25


In [29]:
from sklearn.metrics import confusion_matrix, classification_report

# Making predictions on test data
test_predictions = model.predict([X_val_pad, X_val_2[['lexicon_count', 'difficult_words']].values],)

# Converting predictions to binary labels
binary_predictions = (test_predictions > 0.5).astype(int)

# Getting true labels
true_labels = Y_test_encoded_2
# Computing confusion matrix
conf_matrix = confusion_matrix(true_labels, binary_predictions)

# Printing confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Computing classification report
class_report = classification_report(true_labels, binary_predictions)

# Printing classification report
print("\nClassification Report:")
print(class_report)


Confusion Matrix:
[[4 2]
 [3 3]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.67      0.62         6
           1       0.60      0.50      0.55         6

    accuracy                           0.58        12
   macro avg       0.59      0.58      0.58        12
weighted avg       0.59      0.58      0.58        12

