In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Read both training domains; each file is parsed as JSON Lines (lines=True).
df1, df2 = (
    pd.read_json(path, lines=True)
    for path in ("domain1_train_data.json", "domain2_train_data.json")
)

# Function to prepare text data from token IDs to a space-separated string
def prepare_text(data):
    """Return a copy of *data* whose 'text' column holds space-joined tokens.

    Each entry of the 'text' column is converted to one string with its
    items (stringified) separated by single spaces. Entries that are already
    strings are kept unchanged, so calling this on an already-prepared frame
    (e.g. when a notebook cell is re-run) does not split the string into
    individual characters.

    Args:
        data: DataFrame with a 'text' column of iterables (or strings).

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    def _join_tokens(tokens):
        # A str is itself iterable; joining it would space out every character.
        return tokens if isinstance(tokens, str) else ' '.join(map(str, tokens))

    prepared = data.copy()
    prepared['text'] = prepared['text'].apply(_join_tokens)
    return prepared

# Prepare the text data
df1 = prepare_text(df1)
df2 = prepare_text(df2)

# Combine the datasets
combined_df = pd.concat([df1, df2], ignore_index=True)
y = combined_df['label']

# Split the raw text FIRST. Fitting TF-IDF or running SMOTE on the full data
# before splitting leaks information into the test set: IDF statistics would
# be computed from test documents, and SMOTE would place synthetic points
# interpolated from training-side neighbours into the test set, inflating
# the reported scores.
train_text, test_text, y_train_raw, y_test = train_test_split(
    combined_df['text'], y, test_size=0.2, random_state=42, stratify=y
)

# Initialize TfidfVectorizer.
# The default token_pattern only keeps tokens of two or more characters, which
# would silently drop the single-digit token IDs (0-9); accept any length.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    token_pattern=r"(?u)\b\w+\b",
)

# Learn vocabulary/IDF on the training text only, then apply it to the test text
X_train_raw = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# Handle class imbalance with SMOTE on the training portion only, so the test
# set keeps its original (real, un-resampled) class distribution
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Support Vector Machine (SVM): fit a linear-kernel classifier on the training
# split, score it on the test split and print the metrics.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Evaluate the three metrics in the same order as they are reported below
svm_accuracy, svm_conf_matrix, svm_class_report = (
    metric(y_test, svm_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

print("Support Vector Machine (SVM) Results:")
print(f"Accuracy: {svm_accuracy}")
for heading, value in (
    ("Confusion Matrix:", svm_conf_matrix),
    ("Classification Report:", svm_class_report),
):
    print(heading)
    print(value)

# 1D Convolutional Neural Network (CNN)
# Convert text data to integer sequences and pad them to a common length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_df['text'])
X_seq = tokenizer.texts_to_sequences(combined_df['text'])
max_sequence_length = max(len(seq) for seq in X_seq)
# padding='post' appends zeros after the real tokens up to max_sequence_length
X_pad = pad_sequences(X_seq, maxlen=max_sequence_length, padding='post')

# Split the data into training and test sets (stratified on the label)
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    X_pad, y, test_size=0.2, random_state=42, stratify=y
)

# Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dense -> sigmoid output.
# input_dim is vocabulary size + 1 because index 0 is used for padding.
cnn_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(4),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_seq, y_train_seq, epochs=5, batch_size=64, validation_split=0.2)

# The model outputs one probability per row with shape (n, 1); threshold at
# 0.5 and flatten to 1-D so predictions have the same shape as the labels.
cnn_pred = (cnn_model.predict(X_test_seq) > 0.5).astype("int32").ravel()
cnn_accuracy = accuracy_score(y_test_seq, cnn_pred)
cnn_conf_matrix = confusion_matrix(y_test_seq, cnn_pred)
cnn_class_report = classification_report(y_test_seq, cnn_pred)

print("\n1D Convolutional Neural Network (CNN) Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Confusion Matrix:")
print(cnn_conf_matrix)
print("Classification Report:")
print(cnn_class_report)

# LSTM Model
# The sequences are post-padded with zeros. mask_zero=True makes the Embedding
# emit a mask so the LSTM skips the padded timesteps; without it the final
# LSTM state is computed after a (possibly long) run of padding tokens.
lstm_model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=100,
        input_length=max_sequence_length,
        mask_zero=True,
    ),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_seq, y_train_seq, epochs=5, batch_size=64, validation_split=0.2)

# The model outputs one probability per row with shape (n, 1); threshold at
# 0.5 and flatten to 1-D so predictions have the same shape as the labels.
lstm_pred = (lstm_model.predict(X_test_seq) > 0.5).astype("int32").ravel()
lstm_accuracy = accuracy_score(y_test_seq, lstm_pred)
lstm_conf_matrix = confusion_matrix(y_test_seq, lstm_pred)
lstm_class_report = classification_report(y_test_seq, lstm_pred)

print("\nLong Short-Term Memory (LSTM) Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Confusion Matrix:")
print(lstm_conf_matrix)
print("Classification Report:")
print(lstm_class_report)


In [None]:
# I got an accuracy of 72% with this; I want to increase it further.

# Training:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import shap

# Read both training domains; each file is parsed as JSON Lines (lines=True).
df1, df2 = (
    pd.read_json(path, lines=True)
    for path in ("/content/domain1_train_data.json", "/content/domain2_train_data.json")
)

# Function to prepare text data from token IDs to a space-separated string
def prepare_text(data):
    """Return a copy of *data* whose 'text' column holds space-joined tokens.

    Each entry of the 'text' column is converted to one string with its
    items (stringified) separated by single spaces. Entries that are already
    strings are kept unchanged, so calling this on an already-prepared frame
    (e.g. when a notebook cell is re-run) does not split the string into
    individual characters.

    Args:
        data: DataFrame with a 'text' column of iterables (or strings).

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    def _join_tokens(tokens):
        # A str is itself iterable; joining it would space out every character.
        return tokens if isinstance(tokens, str) else ' '.join(map(str, tokens))

    prepared = data.copy()
    prepared['text'] = prepared['text'].apply(_join_tokens)
    return prepared

# Apply text preparation
df1 = prepare_text(df1)
df2 = prepare_text(df2)

# Combine datasets
combined_df = pd.concat([df1, df2], ignore_index=True)
y = combined_df['label']

# Split the raw text into training and testing sets FIRST (stratified).
# Fitting TF-IDF or running SMOTE before the split leaks information into the
# test set: IDF statistics would include test documents, and SMOTE would put
# synthetic points derived from training-side neighbours into the test set,
# inflating the reported accuracy.
train_text, test_text, y_train_raw, y_test = train_test_split(
    combined_df['text'], y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize text data with TfidfVectorizer, limited to 10000 features.
# The default token_pattern only keeps tokens of two or more characters, which
# would silently drop the single-digit token IDs (0-9); accept any length.
vectorizer = TfidfVectorizer(max_features=10000, token_pattern=r"(?u)\b\w+\b")
X_train_raw = vectorizer.fit_transform(train_text)  # fit on training text only
X_test = vectorizer.transform(test_text)

# Handling class imbalance with SMOTE on the training portion only, so the
# test set keeps its original (real, un-resampled) class distribution
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Tune a Logistic Regression with an exhaustive grid search: the grid varies
# the regularisation strength C and the solver, scored by accuracy under
# 5-fold stratified cross-validation.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
}
base_estimator = LogisticRegression(max_iter=10000)
cv_splitter = StratifiedKFold(5)
model = GridSearchCV(base_estimator, param_grid, cv=cv_splitter, scoring='accuracy')
model.fit(X_train, y_train)

# Score the fitted search object on the held-out split
y_pred = model.predict(X_test)
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Report the selected hyperparameters followed by the evaluation metrics
print(f"Best Parameters: {model.best_params_}")
print(f"Accuracy: {accuracy}")
for heading, value in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(value)

# Model Explainability with SHAP: explain the best estimator found by the grid
# search, using the training matrix as background data, then plot a summary
# of the SHAP values computed for the test matrix.
best_estimator = model.best_estimator_
feature_names = vectorizer.get_feature_names_out()
explainer = shap.LinearExplainer(
    best_estimator,
    X_train,
    feature_perturbation="interventional",
)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

#Testing prediction code:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE

# Assuming the earlier defined functions and model training code is available and executed
# ... [All the earlier code for model training]

# Load the test dataset
df_test = pd.read_json("/content/test_data.json", lines=True)

# Prepare the test text with the same helper used for the training data, so
# the two conversions cannot drift apart
df_test = prepare_text(df_test)

# Transform the test data using the TfidfVectorizer fitted during training.
# A separate name is used on purpose: reusing X_test would overwrite the
# held-out evaluation features and leave them inconsistent with y_test.
X_unlabeled = vectorizer.transform(df_test['text'])

# Predict on the test data using the trained model
test_predictions = model.predict(X_unlabeled)

# Store the predictions in a dataframe keyed by the test record id
results_df = pd.DataFrame({
    'id': df_test['id'],
    'predicted_label': test_predictions,
})

# Export the predictions to a CSV file (no index column)
results_df.to_csv('test_predictions.csv', index=False)

# Since we are working in a simulated Python environment,
# the file will be saved in the virtual storage provided.
# You would need to adjust the file path according to your environment.
