In [1]:
import numpy as np
import pandas as pd
import re
import warnings
import time
import os

warnings.filterwarnings("ignore")

In [2]:
def preprocessing(row_csv, trainData_csv, testData_csv):
    """
        row_csv:           a csv file containing target column index.
        trainData_csv:     training dataset, csv file.
        testData_csv:      testing dataset, csv file.
        
    """
    
    # display
    print("Part I: Preprocessing")
    print("Start preprocessing...")
    
    # create dataframe
    try:
        train_df = pd.read_csv(trainData_csv, encoding = "latin1")
    except:
        print("No data detected! Please put this .py at the root directory, such as the netid directory.")
        os._exit(0)
    else:
        print("Train Data Loading Success.")
    
    try:
        test_df = pd.read_csv(testData_csv)
    except:
        testData_csv = None
        print("Detected no test data!")
    else:
        print("Test Data Loading Success.")
        
    try:
        row = pd.read_csv(row_csv)
    except:
        print("Missing row.csv!")
        os._exit(0)
    else:
        print("Row.csv Loading Success.")
        
    target_index = int(row["targetIndex"])
    
    # get features
    features = list(train_df.columns)
    
    # target column
    target_col = [features[target_index]]
    target = train_df[target_col[0]]
    features.remove(target_col[0])
    train_df.drop(target_col, axis = 1, inplace = True)
    
    # combine train and test
    combined_train_test = [train_df, test_df] if testData_csv else [train_df]
    
    # replace spaces with null value
    # if too much nan value, we can drop the feature
    for dataset in combined_train_test:
        for feature in dataset.columns:
            dataset[feature] = dataset[feature].replace(" ", np.nan)
            dataset[feature] = dataset[feature].replace("?", np.nan)
            if dataset[feature].isna().sum()/dataset.shape[0] >= .5:
                dataset.drop([feature], axis = 1, inplace = True)
                
    features = list(train_df.columns)
    
    # numerical columns
    numerical_cols = list(train_df.select_dtypes(exclude = "object").columns)
    numerical_cols = [x for x in numerical_cols if train_df[x].nunique() >= 6]
    
    # categorical columns
    categorical_cols = [x for x in features if not x in numerical_cols]
    
    # binary columns
    bin_cols = [x for x in categorical_cols if train_df[x].nunique()/train_df.shape[0] <= .1]
    
    # multivalues columns
    mul_cols = [x for x in categorical_cols if not x in bin_cols]
    
    # fill in missing values
    for dataset in combined_train_test:
        for feature in numerical_cols:
            dataset[feature].fillna(dataset[feature].median(), inplace = True)
        for feature in categorical_cols:
            dataset[feature].fillna(dataset[feature].mode()[0], inplace = True)
    
    # normalization and label encode
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    scaler, encoder = StandardScaler(), LabelEncoder()
    for dataset in combined_train_test:
        for feature in numerical_cols:
            dataset[feature] = scaler.fit_transform(dataset[feature].values.reshape(-1, 1))
        for feature in bin_cols:
            dataset[feature] = encoder.fit_transform(dataset[feature])
    target = encoder.fit_transform(target)

    # pack dataset for next step
    if len(combined_train_test) == 1:
        data = [train_df, target]
    else:
        data = [train_df, target, test_df]
    cols = [numerical_cols, bin_cols, mul_cols, target_col]
    
    # display
    print("Preprocessing Done!\n")
    print("We have the target feature ", target_col, ".\n")
    print("The categorical features are", categorical_cols, "and numerical features are", numerical_cols, ".\n")
    print("Specifically in categorical features, we have features", 
          bin_cols, ", which have several unique values",
          " and", mul_cols, ", which have plenty of unique values.\n")
    
    return data, cols

In [3]:
def feature_extraction(data, cols):
    """
        data:    a pack containing training set or/and testing set.
        cols:    a list containing types of feature labels.
        
    """
    
    # display
    print("Part II: Feature Extraction")
    print("Start feature extraction...")
    
    # unpack data
    train_df, target = data[0], data[1]
    if len(data) == 3:
        test_df = data[2]
        combined_train_test = [train_df, test_df]
    else:
        combined_train_test = [train_df]
    numerical_cols, target_col = cols[0], cols[3]
    bin_cols, mul_cols = cols[1], cols[2]
    
    # if name feature exists, extract information from name
    def get_title(name):
        title_search = re.search("([A-Za-z]+)\.", name)
        if title_search:
            return title_search.group(1)
        return ""
    
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    NameFeature = []
    for feature in mul_cols:
        if "name" == feature.lower():
            NameFeature.append(feature)
            mul_cols.remove(feature)
    
    NewFeature = []
    for feature in NameFeature:
        name = feature + "_Info"
        NewFeature.append(name)
        for dataset in combined_train_test:
            dataset[name] = dataset[feature].apply(get_title)
            dataset[name] = dataset[name].replace(["Sir", "Lady", "Don", "Rev", "Col", "Capt", "Countess", 
                                                   "Jonkheer", "Major", "Dr", "Dona"], "Others")
            dataset[name] = dataset[name].replace(["Ms"], "Mrs")
            dataset[name] = dataset[name].replace(["Mme", "Mlle"], "Miss")
            dataset[name] = encoder.fit_transform(dataset[name])
            dataset.drop([feature], axis = 1, inplace = True)
        bin_cols.append(name)
    
    # pack data for next step
    if len(combined_train_test) == 1:
        data = [train_df, target]
    else:
        data = [train_df, target, test_df]
    cols = [numerical_cols, bin_cols, mul_cols, target_col]
    
    # display
    print("Feature Extraction Done!\n")
    if not NewFeature:
        print("We don't get new features.")
    else:
        print("We get new features", NewFeature, "and replace the original features respectively.\n")
    
    return data, cols

In [4]:
def feature_selection(data, cols):
    """
        data:    a pack containing training set or/and testing set.
        cols:    a list containing types of feature labels.
        
    """
    
    # display
    print("Part III: Feature Selection")
    print("Start feature selection...")
    
    # unpack data
    train_df, target = data[0], data[1]
    if len(data) == 3:
        test_df = data[2]
        combined_train_test = [train_df, test_df]
    else:
        combined_train_test = [train_df]
    numerical_cols, target_col = cols[0], cols[3]
    bin_cols, mul_cols = cols[1], cols[2]
    
    # drop id columns
    IdFeature = []
    for i in [numerical_cols, bin_cols, mul_cols]:
        for feature in i:
            if "id" in feature.lower():
                IdFeature.append(feature)
                i.remove(feature)
    
    for feature in IdFeature:
        for dataset in combined_train_test:
            dataset.drop([feature], axis = 1, inplace = True)
    
    # drop columns with same value
    SameValueCol = []
    for i in [numerical_cols, bin_cols, mul_cols]:
        for feature in i:
            for dataset in combined_train_test:
                if (dataset[feature].nunique()) == 1:
                    SameValueCol.append(feature)
                    i.remove(feature)
                    dataset.drop([feature], axis = 1, inplace = True)
    
   
    # drop all multivalue columns
    for feature in mul_cols:
        for dataset in combined_train_test:
            dataset.drop([feature], axis = 1, inplace = True)
    mul_cols = []
    
    
    # drop similar feature
    features = [i for i in numerical_cols] + [j for j in bin_cols] + target_col
    df = pd.concat([data[0], data[2]]) if len(data) == 3 else data[0]
    
    for feature in features:
        elements = feature.lower().split(" ")
        if "abbreviation" in elements:
            df.drop([feature], axis = 1, inplace = True)
            features.remove(feature)
        if "code" in elements:
            tmp = feature.split(" ")[0]
            if tmp in features:
                df.drop([tmp], axis = 1, inplace = True)
                features.remove(tmp)
    
    features = list(df.columns)

    # if too many features, do SelectKBest
    from sklearn.feature_selection import SelectKBest, f_classif 
    if len(features) > 10:
        print("Too many features, do SelectKBest.")
        selector = SelectKBest(f_classif, k = 10)
        train_df = selector.fit_transform(train_df[features], target)
        scores = -np.log10(selector.pvalues_)
        indices = np.argsort(scores)[::-1]
        new_features = []
        for f in range(10):
            new_features.append(features[indices[f]])
    
    # pack dataset for next step
    if len(combined_train_test) == 1:
        data = [train_df, target]
    else:
        data = [train_df, target, test_df]
    
    # display
    print("Feature Selection Done!\n")
    if len(features) > 10:
        print("Finally, the feature selected for estimation are", new_features, ".\n")
    else:
        print("Finally, the feature selected for estimation are", features, ".\n")
    
    return data

In [5]:
def estimation(row_csv, data):
    """
        row_csv: a csv file containing target column index.
        data:    a pack containing training set or/and testing set.
        
    """
    
    # display
    print("Part IV: Estimation")
    print("Start estimation...")
    
    # import library
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.metrics import confusion_matrix
    from sklearn.linear_model import LogisticRegression
    
    # unpack data
    train_df, target = data[0], data[1]
    if len(data) == 3:
        test_df = data[2]
    
    # train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(train_df, target, test_size = 0.3)
    
    # get classifier
    row = pd.read_csv(row_csv)
    code = row["estimator3"].values[0]
    try:
        clf = eval(code)
    except:
        print("\nThere is something wrong with 'estimator3' in your row.csv file, please check again!")
        os.exit(0)
    else:
        print("\nLoad estimator success!")
    
    # cross_validation
    score_type = row["performanceMetric"].values[0]
    scores = cross_val_score(clf, train_df, target, cv = 5, scoring = score_type)
    
    # display
    print("Estimation Done!\n")
    print("Classifier is " + code + "and the performance metric is:", score_type)
    print("\nScore: %0.3f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'Classifier Cross Validation'))
    
    
    

In [6]:
def postprocessing():
    pass

In [24]:
def main(row_csv, trainData_csv, testData_csv = None):
    """
        row_csv:           a csv file containing target column index.
        trainData_csv:     training dataset, csv file.
        testData_csv:      testing dataset, csv file.
        
    """
    
    #
    start_time = time.time()
    
    data, cols = preprocessing(row_csv, trainData_csv, testData_csv)
    data, cols = feature_extraction(data, cols)
    data = feature_selection(data, cols)
    estimation(row_csv, data)
    postprocessing()
    
    print("--- Total runnung time %s seconds ---" % (time.time() - start_time))
    
    return 

if __name__ == "__main__":
    main("submission/row.csv", "data/trainData.csv", "data/testData.csv")
    #main("Medical Appointment/submission/row.csv", "Medical Appointment/data/trainData.csv", "Medical Appointment/data/testData.csv")

Part I: Preprocessing
Start preprocessing...
Train Data Loading Success.
Detected no test data!
Row.csv Loading Success.
Preprocessing Done!

We have the target feature  ['No-show'] .

The categorical features are ['Gender', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received'] and numerical features are ['PatientId', 'AppointmentID', 'Age'] .

Specifically in categorical features, we have features ['Gender', 'AppointmentDay', 'Neighbourhood', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received'] , which have several unique values  and ['ScheduledDay'] , which have plenty of unique values.

Part II: Feature Extraction
Start feature extraction...
Feature Extraction Done!

We don't get new features.
Part III: Feature Selection
Start feature selection...
Too many features, do SelectKBest.
Feature Selection Done!

Finally, the feature selected for estimation are ['SMS_received', '