In [21]:
# First, import all the libraries.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from sklearn.svm import SVC

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\12har\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Load your dataset
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data Preprocessing
# Fill missing values
df['location'].fillna('No Location', inplace=True)
df['keyword'].fillna('No Keyword', inplace=True)

# Text cleaning function
def clean_text(text):
    text = re.sub(r'http\S+', ' ', text)  # remove urls
    text = re.sub(r'<.*?>', ' ', text)    # remove html tags
    text = re.sub(r'\d+', ' ', text)      # remove digits
    text = re.sub(r'#\w+', ' ', text)     # remove hashtags
    text = re.sub(r'@\w+', ' ', text)     # remove mentions
    text = re.sub(r'\s+', ' ', text)      # remove extra whitespace
    text = text.lower()                   # convert to lowercase
    return text

# Apply text cleaning function
df['text'] = df['text'].apply(clean_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

In [14]:
# 1. Simple Logistic Regression model with TF-IDF for text vectorization
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7925147734734077
Confusion Matrix:
 [[771 103]
 [213 436]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.88      0.83       874
           1       0.81      0.67      0.73       649

    accuracy                           0.79      1523
   macro avg       0.80      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [24]:
# 1-2. logistic regression model with feature engineering
from sklearn.preprocessing import OneHotEncoder

# One-Hot Encoding for 'keyword' and 'location'
encoder = OneHotEncoder(sparse=False)
encoded_keywords = encoder.fit_transform(df[['keyword']])
encoded_locations = encoder.fit_transform(df[['location']])

from scipy.sparse import hstack

tfidf = TfidfVectorizer(stop_words=stopwords.words('english'))
tfidf_text = tfidf.fit_transform(df['text'])

# Combine TF-IDF with encoded keyword and location
combined_features = hstack([tfidf_text, encoded_keywords, encoded_locations])

X_train, X_test, y_train, y_test = train_test_split(combined_features, df['target'], test_size=0.2, random_state=42)

# Logistic Regression
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.7734734077478661


In [15]:
# 2. Naive Bayes

pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', MultinomialNB()),
])

pipeline_nb.fit(X_train, y_train)
y_pred_nb = pipeline_nb.predict(X_test)

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy: 0.7918581746552856
Confusion Matrix:
 [[782  92]
 [225 424]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.89      0.83       874
           1       0.82      0.65      0.73       649

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.80      0.79      0.79      1523



In [16]:
# 3. Support Vector Machine

pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', SVC(kernel='linear')),
])

pipeline_svc.fit(X_train, y_train)
y_pred_svc = pipeline_svc.predict(X_test)

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))
print("Classification Report:\n", classification_report(y_test, y_pred_svc))

Accuracy: 0.7879185817465528
Confusion Matrix:
 [[743 131]
 [192 457]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       874
           1       0.78      0.70      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [17]:
# 4. Random Forest

pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', RandomForestClassifier(n_estimators=100)),
])

pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.7603414313854235
Confusion Matrix:
 [[685 189]
 [176 473]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.78      0.79       874
           1       0.71      0.73      0.72       649

    accuracy                           0.76      1523
   macro avg       0.76      0.76      0.76      1523
weighted avg       0.76      0.76      0.76      1523



In [18]:
# 5. Deep Learning - LSTM (Long Short-Term Memory)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

# Build LSTM model
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 100))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, batch_size=64, epochs=10, validation_data=(X_test_pad, y_test))

# Predictions
y_pred_lstm = (model.predict(X_test_pad) > 0.5).astype("int32")

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred_lstm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm))
print("Classification Report:\n", classification_report(y_test, y_pred_lstm))




Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7518056467498359
Confusion Matrix:
 [[711 163]
 [215 434]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.81      0.79       874
           1       0.73      0.67      0.70       649

    accuracy                           0.75      1523
   macro avg       0.75      0.74      0.74      1523
weighted avg       0.75      0.75      0.75      1523



#### Now that we've checked all the models accuracy, pick the highest model and train again without splitting train/test.

In [26]:
# Simple Logistic Regression model with TF-IDF for text vectorization

X_train = df['text']
y_train = df['target']
X_test = test['text']

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', LogisticRegression(solver='liblinear')),
])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
y_pred = pipeline.predict(X_test)


In [32]:
result = pd.DataFrame(test['id'])
result['target'] = y_pred
result
result.to_csv('predicted_targets.csv', index=False)

Score: 0.78792