First, we need to import the necessary libraries

In [1]:
from google.colab import drive 
import pandas as pd

import nltk
# Fetch the NLTK resources needed by the cleaning step further down:
# 'punkt' is the tokenizer data (presumably what nltk.word_tokenize relies on — confirm
# against the installed NLTK version), 'stopwords' is the corpus read via
# nltk.corpus.stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Connect to Google Drive

In [2]:
# Mount Google Drive at /content/drive; the dataset and output paths used in
# the following cells all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


Read Train and Test CSV files

In [3]:
# Locations of the two dataset splits on the mounted Drive
train_csv = '/content/drive/My Drive/DataAnalytics/Datasets/Dataset_1/train.csv'
test_csv = '/content/drive/My Drive/DataAnalytics/Datasets/Dataset_1/test.csv'

# Load each split into its own DataFrame
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

Concatenate Title and Content into a single Text column

In [4]:
# Build one text field per row from the title and the body.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN for the whole row, which would later break the regex-based cleaning.
train['Text'] = train['Title'].fillna('') + ' ' + train['Content'].fillna('')
test['Text'] = test['Title'].fillna('') + ' ' + test['Content'].fillna('')

Cleaning

In [5]:
# Cached English stop-word set; filled on the first call to clean_text so the
# corpus file is read once instead of once per row.
_STOP_WORDS = None


def _english_stop_words():
  """Return the NLTK English stop words as a set, loading them on first use."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
  return _STOP_WORDS


def clean_text(text):
  """Normalise one document for vectorization.

  Steps: strip punctuation, lowercase, tokenize with NLTK, drop English stop
  words, and join the remaining tokens with single spaces.

  Args:
    text: The raw document. Missing values (None, NaN, pd.NA) are treated as
      an empty document; other non-string values are converted with str().

  Returns:
    The cleaned text as a single space-separated string ('' for missing input).
  """
  if not isinstance(text, str):
    # A missing cell reaches here as None/NaN; re.sub would raise TypeError on it.
    if pd.isna(text):
      return ''
    text = str(text)

  # remove punctuation (anything that is neither a word character nor whitespace)
  text = re.sub(r'[^\w\s]', '', text)
  # lowercase the text
  text = text.lower()

  # remove stop words
  # NOTE(review): punctuation is stripped before this filter, so stop words that
  # contain an apostrophe are compared against their apostrophe-less form and
  # will presumably never match — confirm whether that is intended.
  stop_words = _english_stop_words()
  tokens = nltk.word_tokenize(text)

  # join the surviving tokens into a single string
  return ' '.join(token for token in tokens if token not in stop_words)

# Run the same cleaning routine over the Text column of both splits
for frame in (train, test):
  frame['Text'] = frame['Text'].apply(clean_text)

Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Target labels, taken straight from the training frame
y_train = train['Label']

# TF-IDF features: the vocabulary and weights are learned from the training
# texts only, then the fitted vectorizer is applied to the test texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['Text'])
X_test = vectorizer.transform(test['Text'])

# Fit a multinomial Naive Bayes model and label the test set
nb = MultinomialNB().fit(X_train, y_train)
test_predictions = nb.predict(X_test)

# Persist one (Id, Predicted) row per test document
output_path = '/content/drive/My Drive/DataAnalytics/Output/Output_1/results_naive_bayes.csv'
df = pd.DataFrame({'Id': test['Id'], 'Predicted': test_predictions})
df.to_csv(output_path, index=False)

SVM

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Labels the classifier has to learn
y_train = train['Label']

# Turn the cleaned texts into TF-IDF vectors (fit on train, reuse on test)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['Text'])
X_test = vectorizer.transform(test['Text'])

# Linear support-vector classifier: train, then predict the test split
svm = LinearSVC().fit(X_train, y_train)
test_predictions = svm.predict(X_test)

# Write the predictions next to their document ids
output_path = '/content/drive/My Drive/DataAnalytics/Output/Output_1/results_svm.csv'
df = pd.DataFrame({'Id': test['Id'], 'Predicted': test_predictions})
df.to_csv(output_path, index=False)


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the texts using Tfidf
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['Text'])
X_test = vectorizer.transform(test['Text'])

# Split the train data into features (X) and target (y).
# Bound here explicitly so the cell does not depend on a `y_train` left over
# from whichever cell ran last (the Keras cell rebinds it to a one-hot matrix).
y_train = train['Label']

# Define the model; a fixed random_state keeps the fitted tree the same between runs
model = DecisionTreeClassifier(random_state=0)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test set
test_predictions = model.predict(X_test)

# Save the predictions to a CSV file
df = pd.DataFrame({'Id': test['Id'], 'Predicted': test_predictions})
df.to_csv('/content/drive/My Drive/DataAnalytics/Output/Output_1/results_tree.csv', index=False)


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Target column for supervised training
y_train = train['Label']

# TF-IDF representation of both splits, with the vocabulary fixed by the training texts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['Text'])
X_test = vectorizer.transform(test['Text'])

# Small multi-layer perceptron: one hidden layer of 50 units, at most 10
# optimisation iterations, seeded for repeatable weight initialisation
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=10,
    random_state=0,
).fit(X_train, y_train)

# Label the test documents
test_predictions = mlp.predict(X_test)

# Store the (Id, Predicted) pairs on Drive
output_path = '/content/drive/My Drive/DataAnalytics/Output/Output_1/results_mlp.csv'
df = pd.DataFrame({'Id': test['Id'], 'Predicted': test_predictions})
df.to_csv(output_path, index=False)



Keras

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
import numpy as np
# Number of most-frequent words kept by the tokenizer. The same value is the
# width of each feature vector, so it also defines the network's input shape.
NUM_WORDS = 3000
SEED = 0

# Seed NumPy and TensorFlow so weight initialisation, dropout and shuffling are
# repeatable between runs (as far as the backend allows).
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Convert the text data into fixed-length numerical features: texts_to_matrix
# with its default mode yields a binary bag-of-words vector per document.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(train['Text'])
X_train = tokenizer.texts_to_matrix(train['Text'])
X_test = tokenizer.texts_to_matrix(test['Text'])

# Encode the target labels as integers, then as one-hot rows for the softmax output
le = LabelEncoder()
y_train = le.fit_transform(train['Label'])
num_classes = len(le.classes_)
y_train = keras.utils.to_categorical(y_train, num_classes)

# Define the model architecture: one hidden layer with dropout, softmax over the classes
model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=(NUM_WORDS,)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 10% of the training rows for validation
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Make predictions on the test set: pick the most probable class index per row
# and map it back to the original label values
test_predictions = model.predict(X_test)
test_predictions = np.argmax(test_predictions, axis=1)
test_predictions = le.inverse_transform(test_predictions)

# Save the predictions to a CSV file
df = pd.DataFrame({'Id': test['Id'], 'Predicted': test_predictions})
df.to_csv('/content/drive/My Drive/DataAnalytics/Output/Output_1/results_neural_network.csv', index=False)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
