In [None]:
# Colab-only cell: ask the user to upload the input CSV files into the runtime.
# The captured output below shows hacktrain.csv and hacktest.csv being saved
# under their own names, which are the paths the next cell reads.
from google.colab import files
# Return value is kept in `uploaded`; it is not referenced by the visible cells.
uploaded = files.upload()

Saving hacktrain.csv to hacktrain.csv
Saving hacktest.csv to hacktest.csv


In [None]:

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis  # to calculate skewness and kurtosis (data shape info)

# machine learning tools from sklearn
from sklearn.decomposition import PCA  # for reducing number of features (dimensionality)
from sklearn.impute import KNNImputer  # to fill missing values using k-nearest neighbors
from sklearn.linear_model import LogisticRegression  # the ML model
from sklearn.metrics import classification_report  # for model report (not used below)
from sklearn.model_selection import StratifiedKFold, cross_val_score  # for evaluation
from sklearn.pipeline import make_pipeline  # to chain imputer + model inside cross-validation
from sklearn.preprocessing import LabelEncoder  # to convert labels to numbers

# Load the training and test tables from the working directory.
train = pd.read_csv("hacktrain.csv")
test = pd.read_csv("hacktest.csv")

# Keep the test IDs for the submission file. They are taken from the frame that
# was just loaded, *before* 'ID' is dropped, so the CSV is not read a second time.
test_ids = test['ID'].copy()

# Drop bookkeeping columns that must not be used as features.
# errors='ignore' skips whichever of them a file does not contain; plain
# reassignment (instead of inplace=True) keeps the step free of hidden mutation.
train = train.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')
test = test.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')

# Split the training table into inputs (X_raw) and target labels (y).
X_raw = train.drop(columns=['class'])
y = train['class']

# Independent copy of the test inputs so later steps cannot alter `test`.
X_test_raw = test.copy()

# function to create extra features from raw data
def create_features(df):
    """Build per-row summary features from a numeric table.

    Every feature is computed across the columns of one row (axis=1):

    - ndvi_mean, ndvi_std, ndvi_max, ndvi_min: basic statistics
    - ndvi_nan: number of missing values in the row
    - ndvi_skew, ndvi_kurtosis: pandas' row-wise skewness and kurtosis
    - ndvi_trend: slope of a straight line fitted through the row's
      non-missing values, using the column position (0, 1, 2, ...) as x.
      Rows with fewer than two non-missing values get a slope of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Presumably NDVI readings ordered in time, given the
        feature names -- TODO confirm against the data description.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) with the eight columns above.
    """
    # Column positions serve as the x-axis for the trend fit.
    positions = np.arange(df.shape[1])

    def _row_slope(row):
        # Mask of the entries that are actually present in this row.
        present = row.notna()
        if present.sum() <= 1:
            # A line cannot be fitted through fewer than two points.
            return 0
        # polyfit(..., 1) returns [slope, intercept]; only the slope is kept.
        return np.polyfit(positions[present.to_numpy()], row.dropna(), 1)[0]

    row_stats = {
        'ndvi_mean': df.mean(axis=1),
        'ndvi_std': df.std(axis=1),
        'ndvi_max': df.max(axis=1),
        'ndvi_min': df.min(axis=1),
        'ndvi_nan': df.isna().sum(axis=1),
        'ndvi_skew': df.skew(axis=1),
        'ndvi_kurtosis': df.kurtosis(axis=1),
    }

    # Start from an empty frame on the input index and add the columns in order.
    feats = pd.DataFrame(index=df.index)
    for name, values in row_stats.items():
        feats[name] = values
    feats['ndvi_trend'] = df.apply(_row_slope, axis=1)
    return feats

# Hand-crafted row statistics for the training rows and for the test rows.
X_feat = create_features(X_raw)
X_test_feat = create_features(X_test_raw)

# Compress the raw columns into 5 principal components.
# Missing values are replaced by 0 before the projection; the projection is
# fitted on the training rows and then applied unchanged to the test rows.
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_raw.fillna(0))
X_test_pca = pca.transform(X_test_raw.fillna(0))

# Final matrices: engineered statistics side by side with the PCA components.
X_final = np.hstack([X_feat.values, X_pca])
X_test_final = np.hstack([X_test_feat.values, X_test_pca])

# Fill any NaNs left in the combined feature matrix with a 3-nearest-neighbour
# imputer, fitted on the training rows and then applied to the test rows.
# These imputed matrices are used for the final fit and the test predictions.
imputer = KNNImputer(n_neighbors=3)
X_imputed = imputer.fit_transform(X_final)
X_test_imputed = imputer.transform(X_test_final)

# Encode the class labels as integers; `le` is kept to map predictions back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Logistic regression classifier.
# `multi_class` is deliberately not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the lbfgs solver a problem with three or more
# classes is fitted as multinomial anyway.
# NOTE(review): assumes `y` has more than 2 classes, as the original comment
# stated -- confirm, since the default differs for a binary target.
model = LogisticRegression(
    solver='lbfgs',  # optimizer
    max_iter=500,  # extra iterations so the optimizer has room to converge
    C=0.5,  # inverse regularization strength (smaller = more regularization)
    random_state=42  # fixed seed
)

# Stratified 5-fold cross-validation (class ratio preserved in every fold).
# The imputer is chained with the model and CV runs on the *un-imputed*
# matrix, so the imputer is re-fitted on each training fold only. Scoring on
# X_imputed would let every validation fold influence its own imputation.
# cross_val_score clones the pipeline, so `imputer` and `model` stay untouched.
# NOTE(review): the PCA columns inside X_final were still fitted on all
# training rows, so a small optimistic bias may remain in this estimate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = make_pipeline(imputer, model)
scores = cross_val_score(cv_pipeline, X_final, y_encoded, cv=cv, scoring='accuracy')

# Mean and spread of the fold accuracies.
print(f"Stratified 5-Fold CV Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

# Train the final model on the full (imputed) training data.
model.fit(X_imputed, y_encoded)

# Predict encoded class ids for the test rows.
y_test_pred = model.predict(X_test_imputed)

# Translate the predicted integer ids back into the original class labels.
y_test_labels = le.inverse_transform(y_test_pred)

# Submission table: the saved test IDs with the predicted label next to each,
# written to disk without the index column.
submission = pd.DataFrame({'ID': test_ids}).assign(**{'class': y_test_labels})
submission.to_csv("submission.csv", index=False)
print(" submission.csv created successfully!")
