In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# Section 1: Data Loading

In [104]:
def load_data(train_path, test_path):
    train_data = pd.read_csv(train_path, encoding='ISO-8859-1')
    test_data = pd.read_csv(test_path, encoding='ISO-8859-1')
    return train_data, test_data

In [105]:
train_path = 'store_train.csv'
test_path = 'store_test.csv'
output_path = 'store_predictions.csv'

# Load the data
train_data, test_data = load_data(train_path, test_path)

In [106]:
train_data.head()

Unnamed: 0,Id,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,countyname,storecode,Areaname,countytownname,population,state_alpha,store_Type,store
0,2300919770,848,588,666,1116,1133,9,23,19770,Hancock County,NCNTY23009N23009,"Hancock County, ME",Eastbrook town,423.0,ME,Supermarket Type1,0
1,5000129575,925,717,780,1283,1550,1,50,29575,Addison County,NCNTY50001N50001,"Addison County, VT",Granville town,298.0,VT,Supermarket Type1,0
2,2501308470,924,616,739,1154,1314,13,25,8470,Hampden County,METRO44140M44140,"Springfield, MA HUD Metro FMR Area",Brimfield town,3609.0,MA,Supermarket Type1,1
3,603599999,924,646,683,1292,1297,35,6,99999,Lassen County,NCNTY06035N06035,"Lassen County, CA",Lassen County,34895.0,CA,Supermarket Type3,0
4,5002760100,1017,730,735,1208,1326,27,50,60100,Windsor County,NCNTY50027N50027,"Windsor County, VT",Rochester town,1139.0,VT,Supermarket Type1,0


In [107]:
test_data.head()

Unnamed: 0,Id,sales0,sales1,sales2,sales3,sales4,country,State,CouSub,countyname,storecode,Areaname,countytownname,population,state_alpha,store_Type
0,101799999,696,511,514,867,1034,17.0,1,99999,Chambers County,NCNTY01017N01017,"Chambers County, AL",Chambers County,34215.0,AL,Supermarket Type1
1,101999999,599,481,500,883,894,19.0,1,99999,Cherokee County,NCNTY01019N01019,"Cherokee County, AL",Cherokee County,25989.0,AL,Supermarket Type1
2,102199999,599,423,475,802,1061,21.0,1,99999,Chilton County,METRO13820N01021,"Chilton County, AL HUD Metro FMR Area",Chilton County,43643.0,AL,Supermarket Type1
3,103599999,599,459,462,883,886,35.0,1,99999,Conecuh County,NCNTY01035N01035,"Conecuh County, AL",Conecuh County,13228.0,AL,Supermarket Type1
4,103799999,599,481,505,746,801,37.0,1,99999,Coosa County,NCNTY01037N01037,"Coosa County, AL",Coosa County,11539.0,AL,Supermarket Type3


# Section 2: Data Preprocessing

In [108]:
def preprocess_data(train_data, test_data):
    # Drop unnecessary columns
    train_data = train_data.drop(columns=['Id', 'State', 'Areaname', 'countytownname'])
    test_ids = test_data['Id']
    test_data = test_data.drop(columns=['Id', 'State', 'Areaname', 'countytownname'])

    # Separate features and target variable from training data
    X = train_data.drop(columns=['store'])
    y = train_data['store']

    # Convert countries to category type
    X['country'] = X['country'].astype('category')
    test_data['country'] = test_data['country'].astype('category')

    # Replace 99999 with NaN in CouSub
    X['CouSub'] = X['CouSub'].replace(99999, np.nan)
    test_data['CouSub'] = test_data['CouSub'].replace(99999, np.nan)

    # Feature engineering for 'storecode'
    X['storecode_alphabets'] = X['storecode'].apply(lambda x: ''.join(filter(str.isalpha, str(x))))
    test_data['storecode_alphabets'] = test_data['storecode'].apply(lambda x: ''.join(filter(str.isalpha, str(x))))

    # Drop the original 'storecode' as it should not be used as is
    X = X.drop(columns=['storecode'])
    test_data = test_data.drop(columns=['storecode'])

    # Impute missing values for population and replace population values less than 100 with NaN
    X['population'] = X['population'].apply(lambda x: np.nan if x < 100 else x)
    test_data['population'] = test_data['population'].apply(lambda x: np.nan if x < 100 else x)

    # Impute missing values for population using median strategy
    imputer = SimpleImputer(strategy='median')
    X['population'] = imputer.fit_transform(X[['population']])
    test_data['population'] = imputer.transform(test_data[['population']])
    X['CouSub'] = imputer.fit_transform(X[['CouSub']])
    test_data['CouSub'] = imputer.transform(test_data[['CouSub']])

    # Combine train and test data for consistent preprocessing
    combined_data = pd.concat([X, test_data], keys=['train', 'test'])

    # One-hot encode categorical variables
    categorical_cols = combined_data.select_dtypes(include=['object', 'category']).columns
    combined_data = pd.get_dummies(combined_data, columns=categorical_cols)

    # Split combined data back into train and test sets
    X = combined_data.xs('train')
    test_data = combined_data.xs('test')

    return X, y, test_data, test_ids


In [109]:
X, y, test_data, test_ids = preprocess_data(train_data, test_data)

In [110]:
X.head()

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,CouSub,population,countyname_Abbeville County,countyname_Acadia Parish,...,state_alpha_WV,state_alpha_WY,store_Type_Grocery Store,store_Type_Supermarket Type1,store_Type_Supermarket Type2,store_Type_Supermarket Type3,storecode_alphabets_METROM,storecode_alphabets_METROMM,storecode_alphabets_METRON,storecode_alphabets_NCNTYN
0,848,588,666,1116,1133,9.0,19770.0,423.0,False,False,...,False,False,False,True,False,False,False,False,False,True
1,925,717,780,1283,1550,1.0,29575.0,298.0,False,False,...,False,False,False,True,False,False,False,False,False,True
2,924,616,739,1154,1314,13.0,8470.0,3609.0,False,False,...,False,False,False,True,False,False,True,False,False,False
3,924,646,683,1292,1297,35.0,42460.0,34895.0,False,False,...,False,False,False,False,False,True,False,False,False,True
4,1017,730,735,1208,1326,27.0,60100.0,1139.0,False,False,...,False,False,False,True,False,False,False,False,False,True


In [76]:
y.head()

Unnamed: 0,store
0,0
1,0
2,1
3,0
4,0


In [77]:
test_data.head()

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,CouSub,population,countyname_Abbeville County,countyname_Acadia Parish,...,state_alpha_WV,state_alpha_WY,store_Type_Grocery Store,store_Type_Supermarket Type1,store_Type_Supermarket Type2,store_Type_Supermarket Type3,storecode_alphabets_METROM,storecode_alphabets_METROMM,storecode_alphabets_METRON,storecode_alphabets_NCNTYN
0,696,511,514,867,1034,17.0,42460.0,34215.0,False,False,...,False,False,False,True,False,False,False,False,False,True
1,599,481,500,883,894,19.0,42460.0,25989.0,False,False,...,False,False,False,True,False,False,False,False,False,True
2,599,423,475,802,1061,21.0,42460.0,43643.0,False,False,...,False,False,False,True,False,False,False,False,True,False
3,599,459,462,883,886,35.0,42460.0,13228.0,False,False,...,False,False,False,True,False,False,False,False,False,True
4,599,481,505,746,801,37.0,42460.0,11539.0,False,False,...,False,False,False,False,False,True,False,False,False,True


In [78]:
test_ids.head()

Unnamed: 0,Id
0,101799999
1,101999999
2,102199999
3,103599999
4,103799999


# Section 3: Train-Validation Split

In [79]:
def split_data(X, y, test_size=0.2, random_state=42):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_val, y_train, y_val

In [80]:
X_train, X_val, y_train, y_val = split_data(X, y)

In [81]:
X_train.head()

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,CouSub,population,countyname_Abbeville County,countyname_Acadia Parish,...,state_alpha_WV,state_alpha_WY,store_Type_Grocery Store,store_Type_Supermarket Type1,store_Type_Supermarket Type2,store_Type_Supermarket Type3,storecode_alphabets_METROM,storecode_alphabets_METROMM,storecode_alphabets_METRON,storecode_alphabets_NCNTYN
1117,847,504,672,1055,1132,15.0,48645.0,1752.0,False,False,...,False,False,False,True,False,False,False,False,False,True
3102,922,743,748,1182,1232,13.0,51620.0,355.0,False,False,...,False,False,True,False,False,False,False,False,False,True
2707,604,399,448,890,893,5.0,42460.0,5685.0,False,False,...,False,False,False,True,False,False,False,False,False,True
1102,772,567,571,961,1169,263.0,42460.0,6865.0,False,False,...,False,False,False,True,False,False,False,False,False,True
2181,631,419,466,878,881,107.0,42460.0,30305.0,False,False,...,False,False,False,False,False,True,False,False,False,True


In [82]:
X_val.head()

Unnamed: 0,sales0,sales1,sales2,sales3,sales4,country,CouSub,population,countyname_Abbeville County,countyname_Acadia Parish,...,state_alpha_WV,state_alpha_WY,store_Type_Grocery Store,store_Type_Supermarket Type1,store_Type_Supermarket Type2,store_Type_Supermarket Type3,storecode_alphabets_METROM,storecode_alphabets_METROMM,storecode_alphabets_METRON,storecode_alphabets_NCNTYN
1706,593,402,450,788,792,93.0,42460.0,7089.0,False,False,...,False,False,False,True,False,False,False,False,False,True
3096,643,383,475,876,1139,51.0,42460.0,5916.0,False,False,...,False,False,False,True,False,False,False,False,False,True
729,586,473,494,805,967,11.0,42460.0,18395.0,False,False,...,False,False,False,True,False,False,False,False,False,True
3142,714,551,561,946,1062,43.0,42460.0,64519.0,False,False,...,False,False,False,True,False,False,True,False,False,False
997,927,678,732,1199,1461,11.0,51265.0,7839.0,False,False,...,False,False,False,True,False,False,False,False,True,False


In [83]:
y_train.head()

Unnamed: 0,store
1117,0
3102,0
2707,1
1102,0
2181,0


In [84]:
y_val.head()

Unnamed: 0,store
1706,0
3096,0
729,0
3142,1
997,0


# Section 4: Model Training Pipeline

In [85]:
def train_model(X_train, y_train):
    rf_model = RandomForestClassifier(random_state=42)

    # Hyperparameter tuning using GridSearchCV
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                               cv=3, n_jobs=-1, scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_

In [86]:
rf_model = train_model(X_train, y_train)

# Section 5: Model Evaluation

In [87]:
def evaluate_model(model, X_val, y_val):
    val_probs = model.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, val_probs)
    return auc_score

In [88]:
auc_score = evaluate_model(rf_model, X_val, y_val)
print(f'AUC Score on Validation Set: {auc_score}')

AUC Score on Validation Set: 0.8266792385057471


# Section 6: Predicting Test Data

In [89]:
def predict_test_data(model, test_data):
    test_probs = model.predict_proba(test_data)[:, 1]
    return test_probs

In [96]:
test_data.fillna(0, inplace=True)

In [97]:
test_probs = predict_test_data(rf_model, test_data)

In [98]:
test_probs

array([0.08058475, 0.13328203, 0.85441503, ..., 0.16563669, 0.05489416,
       0.06752917])

In [115]:
assert not pd.isnull(test_probs).any(), "There are NA values in the predictions."


# Section 7: Saving Predictions

In [116]:
def save_predictions(predictions, test_ids, output_path):
    output = pd.DataFrame({'Id': test_ids, 'store': predictions})
    output.to_csv(output_path, index=False)
    print(f"Predictions have been saved to {output_path}")

In [117]:
save_predictions(test_probs, test_ids, output_path)

Predictions have been saved to store_predictions.csv
