### Obtaining Testing and Training data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Collect the full path of every file found under the Kaggle input directory.
paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))


def _find_input(filename):
    """Return the entry of ``paths`` whose base name equals *filename*.

    Raises:
        FileNotFoundError: if no collected path has that base name.
    """
    for path in paths:
        if os.path.basename(path) == filename:
            return path
    raise FileNotFoundError(
        f"{filename!r} not found under /kaggle/input; available files: {paths}"
    )


# Select each CSV by name instead of by its position in ``paths``: os.walk
# yields directory entries in arbitrary order, so positional indices can
# silently load the wrong file.
# NOTE(review): file names are assumed from the variable names they feed
# (sample_submission / test / train) -- confirm against the attached dataset.
sample_submission = pd.read_csv(_find_input('sample_submission.csv'))
test = pd.read_csv(_find_input('test.csv'))
train = pd.read_csv(_find_input('train.csv'))

# Split training data into features (X) and target (y).
# .copy() detaches the feature frame from ``train`` so that later column
# assignments do not trigger pandas' SettingWithCopyWarning.
train_x = train[['qid', 'question_text']].copy()  # Feature columns
train_y = train['target']  # Target column

In [None]:
# Sanity check: preview the first five rows of the feature frame
train_x.head(n=5)

## Data Preprocessing

In [None]:
def _clean_question_text(questions):
    """Return *questions* lower-cased with all digits and punctuation removed.

    Args:
        questions: pandas Series of question strings (values are cast to str).

    Returns:
        A new Series; the input is not modified.
    """
    return (
        questions.astype(str)
        .str.lower()
        .str.replace(r'\d+', '', regex=True)
        # regex=True must be explicit: pandas >= 2.0 defaults to literal
        # matching, which would leave every punctuation character in place.
        .str.replace(r'[^\w\s]', '', regex=True)
    )


# Build new frames with assign() rather than writing through .loc, so no
# assignment is made into a slice of another DataFrame.
train_x = train_x.assign(question_text=_clean_question_text(train_x['question_text']))
# Apply the identical cleaning to the test questions so the text vectorised at
# prediction time is normalised the same way as the text the model is fitted on.
test = test.assign(question_text=_clean_question_text(test['question_text']))

## Train and Validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

## Vectorize Sentences

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for the question text.
# max_features caps the vocabulary size; tune it to trade accuracy against sparsity.
vectorizer = TfidfVectorizer(max_features=10000)

# The vocabulary and IDF weights are learned from the training split only ...
train_x_vec = vectorizer.fit_transform(x_train['question_text'])

# ... and then reused unchanged for the validation and test questions.
val_x_vec, test_vec = (
    vectorizer.transform(frame['question_text']) for frame in (x_val, test)
)

In [None]:
# Shapes of each frame followed by its vectorised counterpart (train, validation, test)
print(*(obj.shape for obj in (x_train, train_x_vec, x_val, val_x_vec, test, test_vec)))

## Train Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 500 because the default of 100 iterations was not enough
model = LogisticRegression(max_iter=500)

# Fit on the TF-IDF features of the training split
model.fit(X=train_x_vec, y=y_train)

## Prediction and Validation Accuracy

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out validation features
y_pred = model.predict(X=val_x_vec)

# Overall accuracy first, then the per-class classification report
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_true=y_val, y_pred=y_pred))

### Submission

In [None]:
def submit(path='submission.csv'):
    """Predict labels for the test questions and write them to a CSV file.

    Reads the module-level ``test`` frame, its ``test_vec`` features and the
    fitted ``model``.

    Args:
        path: Destination CSV file; defaults to ``'submission.csv'``.

    Returns:
        The submission DataFrame with ``qid`` and ``prediction`` columns.
    """
    # Copy so that adding the prediction column does not write into ``test``.
    submission = test[['qid']].copy()
    submission['prediction'] = model.predict(test_vec)
    # index=False (the documented boolean) keeps the row index out of the file.
    submission.to_csv(path, index=False)
    return submission


submit()