### Obtaining Testing and Training data

In [1]:
import numpy as np
import pandas as pd
import os

test = pd.read_csv("../test.csv")
train = pd.read_csv("../train.csv")

train_x = train[['qid', 'question_text']]
train_y = train['target']

In [2]:
train_x.head()

Unnamed: 0,qid,question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco..."
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...


## Data Preprocessing

In [3]:
train_x.loc[:, 'question_text'] = train_x['question_text'].astype(str).str.lower()
train_x.loc[:, 'question_text'] = train_x['question_text'].str.replace(r'\d+', '', regex=True).str.replace('[^\w\s]','')

  train_x.loc[:, 'question_text'] = train_x['question_text'].str.replace(r'\d+', '', regex=True).str.replace('[^\w\s]','')


## Train and Validation split

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Vectorize Sentences

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000)
train_x_vec = vectorizer.fit_transform(x_train['question_text'])
val_x_vec = vectorizer.transform(x_val['question_text'])
test_vec = vectorizer.transform(test['question_text'])

In [6]:
print(x_train.shape, train_x_vec.shape, x_val.shape, val_x_vec.shape, test.shape, test_vec.shape)

(1044897, 2) (1044897, 10000) (261225, 2) (261225, 10000) (375806, 2) (375806, 10000)


## Train Logistic Regression Model

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=500)
model.fit(train_x_vec, y_train)

## Prediction and Test Accuracy

In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(val_x_vec)

accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_val, y_pred))

Validation Accuracy: 0.9537
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98    245369
           1       0.69      0.42      0.53     15856

    accuracy                           0.95    261225
   macro avg       0.83      0.71      0.75    261225
weighted avg       0.95      0.95      0.95    261225



### Sumission

In [12]:
def submit():
    submission = test[['qid']].copy() 
    prediction = model.predict(test_vec)
    submission['prediction'] = prediction 
    submission.to_csv('submission.csv', index=None)
    return submission

submit()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
...,...,...
375801,ffff7fa746bd6d6197a9,0
375802,ffffa1be31c43046ab6b,0
375803,ffffae173b6ca6bfa563,0
375804,ffffb1f7f1a008620287,0
